#关闭警告信息
import warnings
warnings.filterwarnings('ignore')
读取数据并对数据进行必要的检查,包括缺失值、数据类型。
#读取数据
import pandas as pd
Data=pd.read_pickle('DNA methylation data/RA DNA methylation/RA_Methylation_Data.pkl')
Data.info()
<class 'pandas.core.frame.DataFrame'> Index: 6425 entries, 7462 to 7002 Columns: 100008 entries, sample_id to DiseaseEncoder dtypes: float64(100002), int64(2), object(4) memory usage: 4.8+ GB
#检查数据空缺值
print(Data.isnull().sum().sum())
136
#处理空缺数据
Data['gender'].fillna('M',inplace=True)
Data['GenderEncoder'].fillna(0,inplace=True)
#检查空缺数据处理结果
print(Data.isnull().sum().sum())
0
#查看数据结构
Data.head(10)
| sample_id | cg00050873 | cg00212031 | cg00213748 | cg00214611 | cg00455876 | cg01707559 | cg02004872 | cg02011394 | cg02050847 | ... | cg12794168 | cg12799119 | cg12848808 | age | gender | sample_type | disease | GenderEncoder | sample_type_encoder | DiseaseEncoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7462 | train17463 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.482415 | -3.938986 | -1.687774 | 39.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7463 | train17464 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.890330 | -3.619579 | -1.672671 | 28.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7464 | train17465 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 2.049755 | -3.886935 | -2.100192 | 68.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7465 | train17466 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.832866 | -4.112908 | -2.324893 | 30.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7466 | train17467 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.564056 | -3.701353 | -1.814692 | 69.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7467 | train17468 | 1.687774 | -4.247583 | 0.0 | -3.744756 | 0.0 | -2.185284 | -3.314031 | 2.570129 | 3.744756 | ... | 1.482415 | -3.580953 | -1.982282 | 40.0 | M | disease tissue | rheumatoid arthritis | 0.0 | 1 | 1 |
| 7468 | train17469 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.247949 | -3.886935 | -2.069693 | 47.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7469 | train17470 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.790011 | -4.051632 | -1.918093 | 53.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7470 | train17471 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.348848 | -4.051632 | -1.936120 | 62.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
| 7471 | train17472 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.300641 | -4.322159 | -1.848299 | 49.0 | F | disease tissue | rheumatoid arthritis | 1.0 | 1 | 1 |
10 rows × 100008 columns
使用PCA算法计算保留95%和99%信息的维度,并将数据降维至3维可视化数据分布。
#提取甲基化数据并进行PCA处理
Methylation=Data.iloc[:,1:-7]
print(Methylation.shape)
(6425, 100000)
#执行PCA算法-计算保留99%信息时的维度
from sklearn.decomposition import PCA
Methylaion_PCA=PCA(n_components=0.99)
PCA_Methylation_99=Methylaion_PCA.fit_transform(Methylation) #数据降维
print(PCA_Methylation_99.shape)
(6425, 3981)
#执行PCA算法-计算保留99%信息时的维度
from sklearn.decomposition import PCA
Methylaion_PCA=PCA(n_components=0.95)
PCA_Methylation_95=Methylaion_PCA.fit_transform(Methylation) #数据降维
print(PCA_Methylation_95.shape)
(6425, 2514)
#计算可解释方差比
import numpy as np
import matplotlib.pyplot as plt
Explain_variance=np.cumsum(Methylaion_PCA.explained_variance_ratio_)
#绘图
Explain_variance_plt=plt.figure(dpi=300)
Epl_variance=Explain_variance_plt.add_subplot(111)
Epl_variance.set_title('Explainable variance ratio curve')
Epl_variance.grid(color='black',linestyle='-.',alpha=0.2)
Epl_variance.plot(np.arange(1,len(Explain_variance)+1,1,dtype=int),Explain_variance,color='orange',linestyle='-')
#添加95%可解释方差曲线
Epl_variance.scatter(x=2514,y=0.95,color='green',marker='o')
Epl_variance.axhline(y=0.95, color='green', linestyle='-.')
Epl_variance.axvline(x=2514, color='green', linestyle='-.', label='Retain 95 per cent of explainable variance')
#添加99%可解释方差曲线
Epl_variance.scatter(3981,0.99,color='red',marker='o')
Epl_variance.axhline(y=0.99, color='red', linestyle='-.')
Epl_variance.axvline(x=3981, color='red', linestyle='-.', label='Retain 99 per cent of explainable variance')
Epl_variance.set_xlabel('n_components')
Epl_variance.set_ylabel('explained_variance_ratio')
plt.legend()
plt.show()
特征选择:执行低方差过滤、高相关过滤和Lossa回归进行特征选择。
#对数据采取低方差过滤算法
from pandas import DataFrame
from sklearn.feature_selection import VarianceThreshold
Variance_Selector=VarianceThreshold(threshold=0.5)
VarSele_Data=Variance_Selector.fit_transform(Methylation) #执行低方差过滤
VarSele_Data=DataFrame(VarSele_Data)
VarSele_Data.columns=Variance_Selector.get_feature_names_out()
VarSele_Data.shape
(6425, 63403)
#构建特征方差数据
VarianseData=DataFrame()
VarianseData['MetaBolite']=Variance_Selector.feature_names_in_
VarianseData['Variances']=Variance_Selector.variances_
VarianseData.shape
(100000, 2)
#输出为Eecel文件
VarianseData.to_excel('/mnt/workspace/Analysis Data/VarianseData.xlsx','UTF-8')
#执行基于F分布的方差分析进行高相关过滤
from sklearn.feature_selection import SelectKBest, f_classif
Selectk=SelectKBest(score_func=f_classif,k=3981) #保留99%可解释信息
FselectorData=Selectk.fit_transform(VarSele_Data,Data.loc[:,'DiseaseEncoder'])
FselectorData=pd.DataFrame(FselectorData)
feature_nameindex=Selectk.get_support(indices=True) #获取特征索引
feature_names=VarSele_Data.columns #获取特征名
Kfeature_names=[feature_names[i] for i in feature_nameindex]
FselectorData.columns=Kfeature_names
FselectorData.shape
(6425, 3981)
#获取相关系数信息
KBestF_info=DataFrame()
KBestF_info['Feature']=Kfeature_names #载入特征
KBestF_info['scores']=Selectk.scores_[:3981]
KBestF_info['P value']=Selectk.pvalues_[:3981]
KBestF_info.shape
(3981, 3)
#输出相关系数数据
KBestF_info.to_excel('/mnt/workspace/Analysis Data/CorrData.xlsx','UTF-8')
#采用Lasso算法进行特征选择
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
Target=Data.loc[:,'DiseaseEncoder']
# 数据标准化
scaler=StandardScaler()
X_scaled=scaler.fit_transform(FselectorData)
# 拆分数据集
X_train,X_test,y_train,y_test=train_test_split(X_scaled,Target, test_size=0.3, random_state=2024)
lasso_model = LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=2024,max_iter=10000) # 设置Lasso模型
lasso_model.fit(X_train,y_train) # 训练模型
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
solver='liblinear')#评估Lasso模型性能
print('Lasso模型训练集Accuracy为:',accuracy_score(y_train,lasso_model.predict(X_train)))
print('Lasso模型测试集Accuracy为:',accuracy_score(y_test,lasso_model.predict(X_test)))
Lasso模型训练集Accuracy为: 0.9968868134311764 Lasso模型测试集Accuracy为: 0.9942946058091287
#查看模型混淆矩阵
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
def Display_ConfusionMatrix(model,data,target):
ConfusionMatrix_result=confusion_matrix(target,model.predict(data),labels=[0,1]) #计算混淆举证
Display_ConfusionMatrix=ConfusionMatrixDisplay(ConfusionMatrix_result,display_labels=['control','rheumatoid arthritis'])
Display_ConfusionMatrix.plot(include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format='d', ax=None)
plt.title('Confusion Matrix')
plt.show()
Display_ConfusionMatrix(lasso_model,X_train,y_train) #训练集
Display_ConfusionMatrix(lasso_model,X_test,y_test) #测试集
#提取特征并进行特征选择
import numpy as np
LassoFeature_Index=np.where(lasso_model.coef_!=0)[1]
LassoFeature=FselectorData.iloc[:,LassoFeature_Index]
LassoFeature.shape
(6425, 59)
#输出lossa模型
import joblib
joblib.dump(lasso_model,'/mnt/workspace/Analysis Model/LossaModel.pkl')
['/mnt/workspace/Analysis Model/LossaModel.pkl']
使用Lasso验证DNA甲基化位点特征选择后的结果.
#使用特征选择后的模型验证模型精度-输入[batchs,70],输出[batch,1]
X_TrainEv,X_testEv,y_trainEv,y_testEv=train_test_split(LassoFeature,Target, test_size=0.3, random_state=2024)
lasso_modelEvaluateD=LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=2024,max_iter=10000)
lasso_modelEvaluateD.fit(X_TrainEv,y_trainEv)
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=2024,
solver='liblinear')#使用特征选择后的数据集再次训练模型评估特征选择效果-10折交叉验证
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import cross_validate
def Model_Evelate_CV(Model,Data,Target):
cv=KFold(n_splits=10, shuffle=True, random_state=2025)
kv_scores =cross_validate(Model,Data,Target,cv=cv,scoring='accuracy',
return_train_score=True)
print('Fit_time:',kv_scores['fit_time'])
print('Mean Fit_time:',kv_scores['fit_time'].mean())
print('score_time:',kv_scores['score_time'])
print('Mean score_time:',kv_scores['score_time'].mean())
print('train_score:',kv_scores['train_score'])
print('Mean train_score:',kv_scores['train_score'].mean())
print('test_score:',kv_scores['test_score'])
print('Mean test_score:',kv_scores['test_score'].mean())
#用特征选择后的数据集再次训练模型评估特征选择效果-10折交叉验证
lasso_modelEvaluate=LogisticRegression(penalty='l1',solver='liblinear',C=0.1,random_state=2024,max_iter=10000) # 设置Lasso模型
Model_Evelate_CV(Model=lasso_modelEvaluate,Data=LassoFeature,Target=Target)
Fit_time: [0.13763237 0.15320015 0.17018366 0.14581394 0.12758136 0.13371754 0.1441164 0.13194776 0.13309407 0.13670778] Mean Fit_time: 0.14139950275421143 score_time: [0.00301838 0.00292587 0.00316906 0.00275707 0.003124 0.00312471 0.00284743 0.00251055 0.0025866 0.00309825] Mean score_time: 0.0029161930084228515 train_score: [0.99688689 0.99740574 0.99723279 0.99723279 0.99723279 0.99740619 0.99740619 0.99706035 0.99671451 0.99757911] Mean train_score: 0.9972157356217635 test_score: [0.99377916 0.99377916 0.99533437 0.99688958 0.99688958 0.99221184 0.99376947 0.99688474 0.99844237 0.9953271 ] Mean test_score: 0.9953307364718537
#训练集混淆举证
Display_ConfusionMatrix(lasso_modelEvaluateD,X_TrainEv,y_trainEv)
#测试集混淆举证
Display_ConfusionMatrix(lasso_modelEvaluateD,X_testEv,y_testEv)
#整合并输出特征选择结果
CG_Feature=Data.iloc[:,LassoFeature_Index]
DNA_Methylstion_Feature=pd.concat([CG_Feature,Data.iloc[:,-7:]],axis=1)
DNA_Methylstion_Feature.shape
(6425, 66)
#输出特征选择甲基化数据
DNA_Methylstion_Feature.to_pickle('/mnt/workspace/DNA methylation data/RA DNA methylation/RA_Methylation_Feature.pkl')
DNA_Methylstion_Feature.to_excel('/mnt/workspace/DNA methylation data/RA DNA methylation/RA_Methylation_Feature.xlsx','UTF-8')
DNA_Methylstion_Feature.to_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/RA_Methylation_Feature.csv')
使用机器学习提取关键甲基化位点
#读取特征选择后的甲基化数据
import pandas as pd
MethylationFeature=pd.read_csv('/mnt/workspace/RA_Methylation_Feature.csv')
MethylationFeature.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6425 entries, 0 to 6424 Data columns (total 67 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 6425 non-null int64 1 cg00455876 6425 non-null float64 2 cg05544622 6425 non-null float64 3 cg00423014 6425 non-null float64 4 cg00478198 6425 non-null float64 5 cg00776430 6425 non-null float64 6 cg01938887 6425 non-null float64 7 cg02714462 6425 non-null float64 8 cg02896361 6425 non-null float64 9 cg02971902 6425 non-null float64 10 cg03601619 6425 non-null float64 11 cg04029664 6425 non-null float64 12 cg04302300 6425 non-null float64 13 cg04699313 6425 non-null float64 14 cg05257372 6425 non-null float64 15 cg05443523 6425 non-null float64 16 cg06411441 6425 non-null float64 17 cg07066594 6425 non-null float64 18 cg08141049 6425 non-null float64 19 cg08311689 6425 non-null float64 20 cg09357232 6425 non-null float64 21 cg10315562 6425 non-null float64 22 cg11704979 6425 non-null float64 23 cg11923788 6425 non-null float64 24 cg12944030 6425 non-null float64 25 cg14061423 6425 non-null float64 26 cg14745925 6425 non-null float64 27 cg15388264 6425 non-null float64 28 cg16078210 6425 non-null float64 29 cg17482649 6425 non-null float64 30 cg17488844 6425 non-null float64 31 cg17794813 6425 non-null float64 32 cg19248041 6425 non-null float64 33 cg19532714 6425 non-null float64 34 cg19603571 6425 non-null float64 35 cg19784262 6425 non-null float64 36 cg22055524 6425 non-null float64 37 cg22219869 6425 non-null float64 38 cg22221554 6425 non-null float64 39 cg22561883 6425 non-null float64 40 cg23299576 6425 non-null float64 41 cg23925558 6425 non-null float64 42 cg23993836 6425 non-null float64 43 cg24627619 6425 non-null float64 44 cg26012731 6425 non-null float64 45 cg26039926 6425 non-null float64 46 cg26936230 6425 non-null float64 47 cg00097228 6425 non-null float64 48 cg00152515 6425 non-null float64 49 cg00306390 6425 non-null float64 50 cg00342358 6425 non-null float64 51 cg00397635 6425 non-null float64 52 cg00455424 6425 non-null float64 53 cg00534295 6425 non-null float64 54 cg00543485 6425 non-null float64 55 cg00581848 6425 non-null float64 56 cg01393939 6425 non-null float64 57 cg01439876 6425 non-null float64 58 cg01446477 6425 non-null float64 59 cg01515508 6425 non-null float64 60 age 6425 non-null float64 61 gender 6425 non-null object 62 sample_type 6425 non-null object 63 disease 6425 non-null object 64 GenderEncoder 6425 non-null float64 65 sample_type_encoder 6425 non-null int64 66 DiseaseEncoder 6425 non-null int64 dtypes: float64(61), int64(3), object(3) memory usage: 3.3+ MB
MethylationFeature.isnull().sum().sum() #检测空缺数据
0
#统计性别数据
from collections import Counter
print(Counter(MethylationFeature['gender']))
MethylationFeature['gender'].hist()
Counter({'F': 3430, 'M': 2995})
<Axes: >
#统计疾病数据
print(Counter(MethylationFeature['disease']))
MethylationFeature['disease'].hist()
Counter({'control': 6266, 'rheumatoid arthritis': 159})
<Axes: >
进行机器学习分析,筛选类风湿性关节炎特征甲基化位点。
Methylation=MethylationFeature.iloc[:,1:-7] #甲基化数据
MapData=MethylationFeature.loc[:,['age','GenderEncoder','DiseaseEncoder']] #其他附加数据
MLData=pd.concat([Methylation,MapData],axis=1) #数据合并
MLData.shape
(6425, 62)
正常样本与疾病样本总量差异较大,使用数据重采样算法解决数据类别分布不均衡问题。
#查看模型混淆矩阵
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
def Display_ConfusionMatrix(model,data,target):
ConfusionMatrix_result=confusion_matrix(target,model.predict(data),labels=[0,1]) #计算混淆举证
Display_ConfusionMatrix=ConfusionMatrixDisplay(ConfusionMatrix_result,display_labels=['control','rheumatoid arthritis'])
Display_ConfusionMatrix.plot(include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format='d', ax=None)
plt.title('Confusion Matrix')
plt.show()
#计算测试集PR曲线
from pandas import DataFrame
from sklearn.metrics import precision_recall_curve,accuracy_score
def PR_Curve(Model,Data,Label):
X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2024) #划分数据集
predict_score=Model.predict_proba(X_test)[:, 1] #获取概率值
predict=Model.predict(X_test) #获取预测标签
accuracy=accuracy_score(y_test,predict)
precision, recall, thresholds = precision_recall_curve(y_test, predict_score) #计算PR曲线
PR=DataFrame() #将PR曲线数据合并到DataFrame
#PR['thresholds']=thresholds
PR['recall']=recall
PR['precision']=precision
return PR,accuracy
#计算测试集ROC曲线
from sklearn.metrics import roc_curve, auc
def ROC_Curve(Model,Data,Label):
X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2024) #划分数据集
predict_score=Model.predict_proba(X_test)[:, 1] #获取概率值
fpr, tpr, thresholds = roc_curve(y_test, predict_score)
roc_auc = auc(fpr, tpr) #计算AUC
ROC=DataFrame() #将PR曲线数据合并到DataFrame
#ROC['thresholds']=thresholds
ROC['tpr']=tpr
ROC['fpr']=fpr
return ROC,roc_auc
#编写模型训练评估函数
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
def RunTestModel(Model,Data,Label):
X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2024) #划分数据集
Model.fit(X_train,y_train) #训练模型
TrainPredict=Model.predict(X_train) #计算训练集指标
TrainPrecision=precision_score(y_train,TrainPredict)
TrainRecall=recall_score(y_train,TrainPredict)
TrainF1=f1_score(y_train,TrainPredict)
TrainAcuracy=accuracy_score(y_train,TrainPredict)
print('模型训练集Precision:{0},Recall:{1},F1_Score:{2},Accuracy:{3}'.format(TrainPrecision,TrainRecall,TrainF1,TrainAcuracy))
TestPredict=Model.predict(X_test) #测试集预测结果
TestPrecision=precision_score(y_test,TestPredict)
TestRecall=recall_score(y_test,TestPredict)
TestF1=f1_score(y_test,TestPredict)
TestAccuracy=accuracy_score(y_test,TestPredict)
print('模型测试集Precision:{0},Recall:{1},F1_score:{2},Accuracy:{3}'.format(TestPrecision,TestRecall,TestF1,TestAccuracy))
print('-------------------测试集混淆举证-------------------')
Display_ConfusionMatrix(model=Model,data=X_test,target=y_test)
#切割数据
MLTestData=MLData.iloc[:,:-1]
MLTestLabel=MLData.iloc[:,-1]
#数据归一化
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler,MinMaxScaler
ColumnsNames=MLTestData.columns
Stand=StandardScaler()
StandData=Stand.fit_transform(MLTestData)
MinMax=MinMaxScaler(feature_range=(0,1))
MinMaxData=MinMax.fit_transform(StandData)
MLTestData=DataFrame(MinMaxData)
MLTestData.columns=ColumnsNames
#使用随机森林算法作为衡量基准测试数据重采样算法性能
from sklearn.ensemble import RandomForestClassifier
Forest=RandomForestClassifier(random_state=2025)
RunTestModel(Model=Forest,Data=MLTestData,Label=MLTestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:0.75,Recall:0.07142857142857142,F1_score:0.13043478260869565,Accuracy:0.979253112033195 -------------------测试集混淆举证-------------------
RF_PR,RF_Accuracy=PR_Curve(Model=Forest,Data=MLTestData,Label=MLTestLabel)
RF_ROC,RF_AUC=ROC_Curve(Model=Forest,Data=MLTestData,Label=MLTestLabel)
#输出相关数据
RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/RF_PR.xlsx')
RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/RF_ROC.xlsx')
测试欠采样算法。
#使用CNN算法进行欠采样
from imblearn.under_sampling import CondensedNearestNeighbour
CNN=CondensedNearestNeighbour(sampling_strategy='not minority',random_state=2024,n_jobs=-1)
CNN_TestData,CNN_TestLabel=CNN.fit_resample(MLTestData,MLTestLabel)
Counter(CNN_TestLabel)
Counter({0: 338, 1: 159})
ForestCNN=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestCNN,Data=CNN_TestData,Label=CNN_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:0.675,Recall:0.6136363636363636,F1_score:0.6428571428571429,Accuracy:0.8 -------------------测试集混淆举证-------------------
CNN_RF_PR,CNN_RF_Accuracy=PR_Curve(Model=ForestCNN,Data=CNN_TestData,Label=CNN_TestLabel)
CNN_RF_ROC,CNN_RF_AUC=ROC_Curve(Model=ForestCNN,Data=CNN_TestData,Label=CNN_TestLabel)
#输出相关数据
CNN_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/CNN_RF_PR.xlsx')
CNN_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/CNN_RF_ROC.xlsx')
#使用IHT算法进行欠采样
from imblearn.under_sampling import InstanceHardnessThreshold
IHT=InstanceHardnessThreshold(random_state=2024,cv=5,n_jobs=-1)
IHT_TestData,IHT_TestLabel=IHT.fit_resample(MLTestData,MLTestLabel)
Counter(IHT_TestLabel)
Counter({0: 3767, 1: 159})
ForestIHT=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestIHT,Data=IHT_TestData,Label=IHT_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:1.0,Recall:0.7719298245614035,F1_score:0.8712871287128713,Accuracy:0.9889643463497453 -------------------测试集混淆举证-------------------
IHT_RF_PR,IHT_RF_Accuracy=PR_Curve(Model=ForestIHT,Data=IHT_TestData,Label=IHT_TestLabel)
IHT_RF_ROC,IHT_RF_AUC=ROC_Curve(Model=ForestIHT,Data=IHT_TestData,Label=IHT_TestLabel)
#输出相关数据
IHT_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/IHT_RF_PR.xlsx')
IHT_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/IHT_RF_ROC.xlsx')
#使用NearMiss进行欠采样
from imblearn.under_sampling import NearMiss
NM=NearMiss(sampling_strategy='not minority',n_jobs=-1)
NM_TestData,NM_TestLabel=NM.fit_resample(MLTestData,MLTestLabel)
Counter(NM_TestLabel)
Counter({0: 159, 1: 159})
ForestNM=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestNM,Data=NM_TestData,Label=NM_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:0.8653846153846154,Recall:0.8490566037735849,F1_score:0.8571428571428571,Accuracy:0.84375 -------------------测试集混淆举证-------------------
NM_RF_PR,NM_RF_Accuracy=PR_Curve(Model=ForestNM,Data=NM_TestData,Label=NM_TestLabel)
NM_RF_ROC,NM_RF_AUC=ROC_Curve(Model=ForestNM,Data=NM_TestData,Label=NM_TestLabel)
#输出相关数据
NM_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/NM_RF_PR.xlsx')
NM_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/NM_RF_ROC.xlsx')
#使用NeighbourhoodCleaningRule进行欠采样
from imblearn.under_sampling import NeighbourhoodCleaningRule
NBC=NeighbourhoodCleaningRule(sampling_strategy='not minority',n_jobs=-1)
NBC_TestData,NBC_TestLabel=NBC.fit_resample(MLTestData,MLTestLabel)
Counter(NBC_TestLabel)
Counter({0: 6109, 1: 159})
ForestNBC=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestNBC,Data=NBC_TestData,Label=NBC_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:1.0,Recall:0.24390243902439024,F1_score:0.39215686274509803,Accuracy:0.9835194045720361 -------------------测试集混淆举证-------------------
NBC_RF_PR,NBC_RF_Accuracy=PR_Curve(Model=ForestNBC,Data=NBC_TestData,Label=NBC_TestLabel)
NBC_RF_ROC,NBC_RF_AUC=ROC_Curve(Model=ForestNBC,Data=NBC_TestData,Label=NBC_TestLabel)
#输出相关数据
NBC_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/NBC_RF_PR.xlsx')
NBC_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/NBC_RF_ROC.xlsx')
#使用OneSidedSelection进行欠采样
from imblearn.under_sampling import OneSidedSelection
OSS=OneSidedSelection(sampling_strategy='not minority',random_state=2024,n_jobs=-1)
OSS_TestData,OSS_TestLabel=OSS.fit_resample(MLTestData,MLTestLabel)
Counter(OSS_TestLabel)
Counter({0: 6163, 1: 159})
ForestOSS=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestOSS,Data=OSS_TestData,Label=OSS_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:1.0,Recall:0.10869565217391304,F1_score:0.19607843137254902,Accuracy:0.9783869267264101 -------------------测试集混淆举证-------------------
OSS_RF_PR,OSS_RF_Accuracy=PR_Curve(Model=ForestOSS,Data=OSS_TestData,Label=OSS_TestLabel)
OSS_RF_ROC,OSS_RF_AUC=ROC_Curve(Model=ForestOSS,Data=OSS_TestData,Label=OSS_TestLabel)
#输出相关数据
OSS_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/OSS_RF_PR.xlsx')
OSS_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/OSS_RF_ROC.xlsx')
#使用随机欠采样进行数据欠采样
from imblearn.under_sampling import RandomUnderSampler
RUS=RandomUnderSampler(sampling_strategy='not minority',random_state=2024)
RUS_TestData,RUS_TestLabel=RUS.fit_resample(MLTestData,MLTestLabel)
Counter(RUS_TestLabel)
Counter({0: 159, 1: 159})
ForestRUS=RandomForestClassifier(random_state=2025)
RunTestModel(Model=ForestRUS,Data=RUS_TestData,Label=RUS_TestLabel)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:0.9444444444444444,Recall:0.9622641509433962,F1_score:0.9532710280373832,Accuracy:0.9479166666666666 -------------------测试集混淆举证-------------------
RUS_RF_PR,RUS_RF_Accuracy=PR_Curve(Model=ForestRUS,Data=RUS_TestData,Label=RUS_TestLabel)
RUS_RF_ROC,RUS_RF_AUC=ROC_Curve(Model=ForestRUS,Data=RUS_TestData,Label=RUS_TestLabel)
#输出相关数据
RUS_RF_PR.to_excel('/mnt/workspace/Analysis Data/Resample PR Data/RUS_RF_PR.xlsx')
RUS_RF_ROC.to_excel('/mnt/workspace/Analysis Data/Resample ROC Data/RUS_RF_ROC.xlsx')
#绘制PR曲线
import matplotlib.pyplot as plt
PR_curve=plt.figure(dpi=300)
PR_ax=PR_curve.add_subplot(111)
PR_ax.set_title('Precision-Recall curve')
PR_ax.plot(RF_PR['recall'], RF_PR['precision'],color='red',label='RandomForest=%f'%RF_Accuracy)
PR_ax.plot(CNN_RF_PR['recall'], CNN_RF_PR['precision'],color='green',label='CNN=%f'%CNN_RF_Accuracy)
PR_ax.plot(IHT_RF_PR['recall'], IHT_RF_PR['precision'],color='blue',label='IHT=%f'%IHT_RF_Accuracy)
PR_ax.plot(NBC_RF_PR['recall'], NBC_RF_PR['precision'],color='yellow',label='NBC=%f'%NBC_RF_Accuracy)
PR_ax.plot(NM_RF_PR['recall'], NM_RF_PR['precision'],color='purple',label='NM=%f'%NM_RF_Accuracy)
PR_ax.plot(OSS_RF_PR['recall'], OSS_RF_PR['precision'],color='cyan',label='OSS=%f'%OSS_RF_Accuracy)
PR_ax.plot(RUS_RF_PR['recall'], RUS_RF_PR['precision'],color='pink',label='RUS=%f'%RUS_RF_Accuracy)
PR_ax.plot([0,1],[1,0],linestyle='-.',color='black')
PR_ax.set_xlabel('Recall')
PR_ax.set_ylabel('Precision')
plt.legend(loc="best")
plt.show()
#绘制ROC曲线
import matplotlib.pyplot as plt
ROC_curve=plt.figure(dpi=300)
ROC_ax=ROC_curve.add_subplot(111)
ROC_ax.set_title('ROC Curve')
ROC_ax.plot(RF_ROC['fpr'], RF_ROC['tpr'],color='red',label='RandomForest=%f'%RF_AUC)
ROC_ax.plot(CNN_RF_ROC['fpr'], CNN_RF_ROC['tpr'],color='green',label='CNN=%f'%CNN_RF_AUC)
ROC_ax.plot(IHT_RF_ROC['fpr'], IHT_RF_ROC['tpr'],color='blue',label='IHT=%f'%IHT_RF_AUC)
ROC_ax.plot(NBC_RF_ROC['fpr'], NBC_RF_ROC['tpr'],color='yellow',label='NBC=%f'%NBC_RF_AUC)
ROC_ax.plot(NM_RF_ROC['fpr'], NM_RF_ROC['tpr'],color='purple',label='NM=%f'%NM_RF_AUC)
ROC_ax.plot(OSS_RF_ROC['fpr'], OSS_RF_ROC['tpr'],color='cyan',label='OSS=%f'%OSS_RF_AUC)
ROC_ax.plot(RUS_RF_ROC['fpr'], RUS_RF_ROC['tpr'],color='pink',label='RUS=%f'%RUS_RF_AUC)
ROC_ax.plot([0,1],[0,1],linestyle='-.',color='black')
ROC_ax.set_xlim([-0.05, 1.0])
ROC_ax.set_ylim([0, 1.05])
ROC_ax.set_xlabel('FPR')
ROC_ax.set_ylabel('TPR')
plt.legend(loc="best")
plt.show()
import joblib
Model=[Forest,ForestCNN,ForestIHT,ForestNM,ForestNBC,ForestOSS,ForestRUS]
ModelStr=['Forest.pkl','ForestCNN.pkl','ForestIHT.pkl','ForestNM.pkl','ForestNBC.pkl','ForestOSS.pkl','ForestRUS.pkl']
try:
for model,modelstr in zip(Model,ModelStr):
joblib.dump(model,'/mnt/workspace/Analysis Model/Resample Models/'+modelstr)
print('模型保存成功!')
except:
print('模型保存异常!!!')
模型保存成功!
经过混淆矩阵、PR曲线、Roc曲线综合评估,选择随机过采样算法进行数据重采样。数据重采样后进行机器学习和深度神经网络训练并采用可解释性机器学习框架提取关键甲基化位点进行特征分析。
MethylationRUS=MethylationFeature.iloc[:,1:-1]
LabelRUS=MethylationFeature.iloc[:,-1]
#合并过采样结果
RUS=RandomUnderSampler(sampling_strategy='not minority',random_state=2024)
RUS_Methylation,RUS_Label=RUS.fit_resample(MethylationRUS,LabelRUS)
Counter(RUS_Label)
Counter({0: 159, 1: 159})
RUSData=pd.concat([RUS_Methylation,RUS_Label],axis=1)
RUSData.shape
(318, 66)
#输出计算合并结果
RUSData.to_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/RUS DNA Methylation Data.csv')
RUSData.to_excel('/mnt/workspace/DNA methylation data/RA DNA methylation/RUS DNA Methylation Data.xlsx','UFT-8')
读取数据进行机器学习提取特征位点。
#读取数据
import pandas as pd
MethylationData=pd.read_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/RUS DNA Methylation Data.csv')
MethylationData.shape
(318, 67)
#检查空缺数据
MethylationData.isnull().sum().sum()
0
#数据选择和缩放数据
MLFeatureData=MethylationData.iloc[:,1:-6]
MLFeatureData=pd.concat([MLFeatureData,MethylationData.loc[:,'GenderEncoder']],axis=1)
MLFeatureData
| cg00455876 | cg05544622 | cg00423014 | cg00478198 | cg00776430 | cg01938887 | cg02714462 | cg02896361 | cg02971902 | cg03601619 | ... | cg00455424 | cg00534295 | cg00543485 | cg00581848 | cg01393939 | cg01439876 | cg01446477 | cg01515508 | age | GenderEncoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.000000 | 1.202370 | 0.224898 | -1.672671 | -1.271175 | 3.201956 | -0.480968 | -0.739857 | 3.406479 | ... | -4.940737 | -3.343970 | -4.178048 | -2.616463 | 3.149987 | -2.207487 | -3.100385 | 3.701353 | 19.0000 | 1.0 |
| 1 | 0.000000 | 0.000000 | 1.019899 | 0.409549 | -1.578093 | -1.130596 | 3.744756 | -0.289957 | -0.180452 | 2.555085 | ... | -3.790034 | -6.163916 | -6.811545 | -2.468664 | 0.152263 | -2.207487 | -3.406479 | 3.543689 | 12.0000 | 1.0 |
| 2 | 0.000000 | 0.000000 | 0.241113 | 0.140201 | -0.663152 | -1.168907 | 3.507691 | -0.076021 | -0.570906 | 3.029956 | ... | -2.196336 | -3.938986 | -4.247583 | 0.000000 | 0.405382 | -2.414368 | -4.051632 | 1.557086 | 71.8192 | 1.0 |
| 3 | 0.510719 | 4.178048 | 1.839820 | -1.848299 | -2.698069 | -2.921730 | 4.807960 | -3.701353 | -3.938986 | 2.401172 | ... | -3.790034 | -4.489850 | -6.163916 | -2.732410 | 3.124904 | -2.337277 | -3.228904 | 4.178048 | 23.0000 | 0.0 |
| 4 | 0.000000 | 0.000000 | 0.837604 | 0.351505 | -0.809306 | -1.456115 | 4.402578 | 0.000000 | -0.772045 | 2.681278 | ... | -3.580953 | -3.256540 | -5.773449 | 0.000000 | 2.241551 | -2.414368 | -3.659672 | 3.938986 | 84.0000 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 313 | 0.000000 | 0.000000 | 0.000000 | 0.072017 | 0.000000 | -1.798190 | 3.374769 | 0.000000 | -0.654254 | 2.300497 | ... | -3.744756 | -4.585271 | -3.886935 | 0.000000 | 2.441207 | -2.496772 | -4.051632 | 4.402578 | 34.0000 | 1.0 |
| 314 | 0.000000 | 3.659672 | 0.000000 | -2.600812 | -3.149987 | -4.112908 | 4.585271 | 0.000000 | -4.489850 | 3.201956 | ... | -3.790034 | -4.178048 | -5.273603 | 0.000000 | 2.881228 | -2.253116 | -3.284902 | 3.993781 | 51.0000 | 0.0 |
| 315 | 0.000000 | 0.000000 | 1.130596 | 0.039997 | 0.000000 | -2.163468 | 3.007447 | 0.000000 | -0.767424 | 2.020410 | ... | -4.112908 | -4.051632 | -6.163916 | 0.000000 | 0.000000 | -2.570129 | -3.701353 | 3.472874 | 45.0000 | 1.0 |
| 316 | 0.000000 | 0.000000 | 0.000000 | 0.140201 | 0.000000 | -1.578093 | 3.149987 | 0.000000 | -0.506456 | 2.555085 | ... | -3.343970 | -3.790034 | -4.112908 | 0.000000 | 2.785946 | -2.616463 | -3.543689 | 4.178048 | 31.0000 | 1.0 |
| 317 | 0.000000 | 0.000000 | 1.672671 | -0.027996 | -1.082414 | -1.141464 | 2.648415 | 0.000000 | -0.426255 | 2.632328 | ... | -3.659672 | -4.807960 | -5.273603 | 0.000000 | 2.349789 | -2.349789 | -3.993781 | 4.940737 | 58.0000 | 1.0 |
318 rows × 61 columns
#数据缩放和标准化
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler,MinMaxScaler
FeatureNames=MLFeatureData.columns #保存变量名
Standard=StandardScaler() #标准化数据
StandardData=Standard.fit_transform(MLFeatureData)
MinMax=MinMaxScaler(feature_range=(0,1)) #缩放数据
MinMaxData=MinMax.fit_transform(StandardData)
MLRunData=DataFrame(MinMaxData)
MLRunData.columns=FeatureNames
MLRunData.shape
(318, 61)
#获取标签数据
ReMLLabel=MethylationData.iloc[:,-1]
ReMLLabel
0 0
1 0
2 0
3 0
4 0
..
313 1
314 1
315 1
316 1
317 1
Name: DiseaseEncoder, Length: 318, dtype: int64
#查看模型混淆矩阵
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
def Display_ConfusionMatrix(model,data,target):
ConfusionMatrix_result=confusion_matrix(target,model.predict(data),labels=[0,1]) #计算混淆举证
Display_ConfusionMatrix=ConfusionMatrixDisplay(ConfusionMatrix_result,display_labels=['control','rheumatoid arthritis'])
Display_ConfusionMatrix.plot(include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format='d', ax=None)
plt.title('Confusion Matrix')
plt.show()
#计算PR曲线
from pandas import DataFrame
from sklearn.metrics import precision_recall_curve,accuracy_score
def PR_Curve(Model,Data,Label):
X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2025) #划分数据集
predict_score=Model.predict_proba(X_test)[:, 1] #获取概率值
predict=Model.predict(X_test) #获取预测标签
accuracy=accuracy_score(y_test,predict)
precision, recall, thresholds = precision_recall_curve(y_test, predict_score) #计算PR曲线
PR=DataFrame() #将PR曲线数据合并到DataFrame
#PR['thresholds']=thresholds
PR['recall']=recall
PR['precision']=precision
return PR,accuracy
#计算ROC曲线
from sklearn.metrics import roc_curve, auc
def ROC_Curve(Model,Data,Label):
X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2025) #划分数据集
predict_score=Model.predict_proba(X_test)[:, 1] #获取概率值
fpr, tpr, thresholds = roc_curve(y_test, predict_score)
roc_auc = auc(fpr, tpr) #计算AUC
ROC=DataFrame() #将PR曲线数据合并到DataFrame
#ROC['thresholds']=thresholds
ROC['tpr']=tpr
ROC['fpr']=fpr
return ROC,roc_auc
#模型训练评估函数
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
def RunMLModel(Model,Data,Label):
X_train,X_test,y_train,y_test=train_test_split(Data,Label,train_size=0.7,random_state=2025) #划分数据集
Model.fit(X_train,y_train) #训练模型
TrainPredict=Model.predict(X_train) #计算训练集指标
TrainPrecision=precision_score(y_train,TrainPredict)
TrainRecall=recall_score(y_train,TrainPredict)
TrainF1=f1_score(y_train,TrainPredict)
TrainAcuracy=accuracy_score(y_train,TrainPredict)
print('模型训练集Precision:{0},Recall:{1},F1_Score:{2},Accuracy:{3}'.format(TrainPrecision,TrainRecall,TrainF1,TrainAcuracy))
TestPredict=Model.predict(X_test) #测试集预测结果
TestPrecision=precision_score(y_test,TestPredict)
TestRecall=recall_score(y_test,TestPredict)
TestF1=f1_score(y_test,TestPredict)
TestAccuracy=accuracy_score(y_test,TestPredict)
print('模型测试集Precision:{0},Recall:{1},F1_score:{2},Accuracy:{3}'.format(TestPrecision,TestRecall,TestF1,TestAccuracy))
print('-------------------测试集混淆举证-------------------')
Display_ConfusionMatrix(model=Model,data=X_test,target=y_test)
#测试Logistic回归
from sklearn.linear_model import LogisticRegression
Logistic=LogisticRegression(penalty="l2",dual=True,tol=1e-4,C=1.0,fit_intercept=True,random_state=2024,solver='liblinear',
max_iter=100,multi_class="auto",verbose=0, warm_start=False,n_jobs=-1,l1_ratio=None)
RunMLModel(Model=Logistic,Data=MLRunData,Label=ReMLLabel)
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning. warnings.warn( /usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 8. warnings.warn(
模型训练集Precision:0.8870967741935484,Recall:0.9649122807017544,F1_Score:0.9243697478991597,Accuracy:0.918918918918919 模型测试集Precision:0.88,Recall:0.9777777777777777,F1_score:0.9263157894736842,Accuracy:0.9270833333333334 -------------------测试集混淆举证-------------------
#计算PR曲线和ROC曲线值
Logistic_PR,Logistic_Accuracy=PR_Curve(Model=Logistic,Data=MLRunData,Label=ReMLLabel)
Logistic_ROC,Logistic_AUC=ROC_Curve(Model=Logistic,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
Logistic_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/Logistic_PR.xlsx','UTF-8')
Logistic_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/Logistic_ROC.xlsx','UTF-8')
/tmp/ipykernel_295/3354021341.py:5: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
Logistic_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/Logistic_PR.xlsx','UTF-8')
/tmp/ipykernel_295/3354021341.py:6: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
Logistic_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/Logistic_ROC.xlsx','UTF-8')
#测试SVM算法
from sklearn.svm import SVC
SVM_Classifier=SVC(C=1.0,kernel="rbf",degree=3,gamma="scale",coef0=0.0,shrinking=True,probability=True,tol=1e-3,
cache_size=200,max_iter=-1, decision_function_shape="ovr",random_state=2024)
RunMLModel(Model=SVM_Classifier,Data=MLRunData,Label=ReMLLabel)
模型训练集Precision:0.8934426229508197,Recall:0.956140350877193,F1_Score:0.923728813559322,Accuracy:0.918918918918919 模型测试集Precision:0.9361702127659575,Recall:0.9777777777777777,F1_score:0.9565217391304348,Accuracy:0.9583333333333334 -------------------测试集混淆举证-------------------
#计算PR曲线和ROC曲线值
SVM_PR,SVM_Accuracy=PR_Curve(Model=SVM_Classifier,Data=MLRunData,Label=ReMLLabel)
SVM_ROC,SVM_AUC=ROC_Curve(Model=SVM_Classifier,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
SVM_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/SVM_PR.xlsx','UTF-8')
SVM_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/SVM_ROC.xlsx','UTF-8')
/tmp/ipykernel_295/4264335228.py:5: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
SVM_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/SVM_PR.xlsx','UTF-8')
/tmp/ipykernel_295/4264335228.py:6: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
SVM_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/SVM_ROC.xlsx','UTF-8')
#测试决策树算法
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
DecisionTree=DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=10,min_samples_split=2,random_state=2024)
Research_params={'max_depth':np.arange(1,20,1,dtype=int),'min_samples_split':np.arange(1,20,1,dtype=int),
'criterion':['gini','entropy']} #超参数列表
GridSearch=GridSearchCV(estimator=DecisionTree,param_grid=Research_params,cv=5,
scoring='accuracy',return_train_score=True,n_jobs=-1,verbose=0)
GridSearch.fit(MLRunData,ReMLLabel)
print('超参数搜索最佳超参数为;',GridSearch.best_params_)
print('超参数搜索最佳得分为:',GridSearch.best_score_)
Best_DTC=GridSearch.best_estimator_
超参数搜索最佳超参数为; {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 14}
超参数搜索最佳得分为: 0.88015873015873
RunMLModel(Model=Best_DTC,Data=MLRunData,Label=ReMLLabel)
模型训练集Precision:1.0,Recall:0.9649122807017544,F1_Score:0.9821428571428571,Accuracy:0.9819819819819819 模型测试集Precision:0.8913043478260869,Recall:0.9111111111111111,F1_score:0.9010989010989011,Accuracy:0.90625 -------------------测试集混淆举证-------------------
#计算PR曲线和ROC曲线值
DTC_PR,DTC_Accuracy=PR_Curve(Model=Best_DTC,Data=MLRunData,Label=ReMLLabel)
DTC_ROC,DTC_AUC=ROC_Curve(Model=Best_DTC,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
DTC_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/DTC_PR.xlsx','UTF-8')
DTC_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/DTC_ROC.xlsx','UTF-8')
#对随机森林进行网格搜索
from sklearn.ensemble import RandomForestClassifier
RandomForest=RandomForestClassifier(n_estimators=100,criterion='gini',max_depth=7,min_samples_split=6,
bootstrap=True,min_samples_leaf=2,max_features='auto',random_state=2024)
RandomForest_params={'n_estimators':np.arange(1,40,2,dtype=int),'min_samples_leaf':np.arange(1,40,2,dtype=int),
'max_features':['auto','sqrt'],'criterion':['gini','entropy']}
GridSearch_RandomForest=GridSearchCV(estimator=RandomForest,param_grid=RandomForest_params,cv=5,
scoring='accuracy',return_train_score=False,n_jobs=-1,verbose=0)
GridSearch_RandomForest.fit(MLRunData,ReMLLabel)
print('超参数搜索最佳超参数为;',GridSearch_RandomForest.best_params_)
print('超参数搜索最佳得分为:',GridSearch_RandomForest.best_score_)
Best_Forest=GridSearch_RandomForest.best_estimator_
超参数搜索最佳超参数为; {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 29}
超参数搜索最佳得分为: 0.9526289682539681
#评估最佳模型
RunMLModel(Model=Best_Forest,Data=MLRunData,Label=ReMLLabel)
模型训练集Precision:0.9912280701754386,Recall:0.9912280701754386,F1_Score:0.9912280701754386,Accuracy:0.990990990990991 模型测试集Precision:0.8913043478260869,Recall:0.9111111111111111,F1_score:0.9010989010989011,Accuracy:0.90625 -------------------测试集混淆举证-------------------
#计算PR曲线和ROC曲线值
RondomForest_PR,RondomForest_Accuracy=PR_Curve(Model=Best_Forest,Data=MLRunData,Label=ReMLLabel)
RondomForest_ROC,RondomForest_AUC=ROC_Curve(Model=Best_Forest,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
RondomForest_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/RondomForest_PR.xlsx','UTF-8')
RondomForest_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/RondomForest_ROC.xlsx','UTF-8')
#测试XGBoost模型
from xgboost import XGBClassifier
XGBoost=XGBClassifier(n_jobs=-1,verbosity=1,tree_method='auto',gpu_id=0,random_state=2025)
RunMLModel(Model=XGBoost,Data=MLRunData,Label=ReMLLabel)
/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [10:06:40] WARNING: /workspace/src/common/error_msg.cc:45: `gpu_id` is deprecated since2.0.0, use `device` instead. E.g. device=cpu/cuda/cuda:0 warnings.warn(smsg, UserWarning)
模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:0.875,Recall:0.9333333333333333,F1_score:0.9032258064516129,Accuracy:0.90625 -------------------测试集混淆举证-------------------
/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [10:06:41] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu. Potential solutions: - Use a data structure that matches the device ordinal in the booster. - Set the device for booster before call to inplace_predict. This warning will only be shown once. warnings.warn(smsg, UserWarning)
#计算PR曲线和ROC曲线值
XGBoost_PR,XGBoost_Accuracy=PR_Curve(Model=XGBoost,Data=MLRunData,Label=ReMLLabel)
XGBoost_ROC,XGBoost_AUC=ROC_Curve(Model=XGBoost,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
XGBoost_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/XGBoost_PR.xlsx','UTF-8')
XGBoost_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/XGBoost_ROC.xlsx','UTF-8')
#测试LightGBM算法
from lightgbm import LGBMClassifier
LightGBM=LGBMClassifier(boosting_type='gbdt',num_leaves=31,learning_rate=0.01,n_estimators=200,
n_jobs=-1,objective='binary',metric='binary_logloss',keep_training_booster=True,
importance_type='gini',random_state=2025)
RunMLModel(Model=LightGBM,Data=MLRunData,Label=ReMLLabel)
[LightGBM] [Warning] Unknown parameter: keep_training_booster [LightGBM] [Warning] Unknown parameter: keep_training_booster [LightGBM] [Info] Number of positive: 114, number of negative: 108 [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001426 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 2829 [LightGBM] [Info] Number of data points in the train set: 222, number of used features: 60 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513514 -> initscore=0.054067 [LightGBM] [Info] Start training from score 0.054067 [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] No further splits with positive gain, best gain: -inf [LightGBM] [Warning] Unknown parameter: keep_training_booster 模型训练集Precision:0.9911504424778761,Recall:0.9824561403508771,F1_Score:0.986784140969163,Accuracy:0.9864864864864865 [LightGBM] [Warning] Unknown parameter: keep_training_booster 模型测试集Precision:0.8604651162790697,Recall:0.8222222222222222,F1_score:0.8409090909090909,Accuracy:0.8541666666666666 -------------------测试集混淆举证------------------- [LightGBM] [Warning] Unknown parameter: keep_training_booster
#计算PR曲线和ROC曲线值
LightGBM_PR,LightGBM_Accuracy=PR_Curve(Model=LightGBM,Data=MLRunData,Label=ReMLLabel)
LightGBM_ROC,LightGBM_AUC=ROC_Curve(Model=LightGBM,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
LightGBM_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/LightGBM_PR.xlsx','UTF-8')
LightGBM_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/LightGBM_ROC.xlsx','UTF-8')
#测试CatBoost算法
from catboost import CatBoostClassifier
CatBoost=CatBoostClassifier(learning_rate=0.01,depth=6,iterations=500,thread_count=-1)
RunMLModel(Model=CatBoost,Data=MLRunData,Label=ReMLLabel)
0: learn: 0.6846040 total: 52ms remaining: 25.9s 1: learn: 0.6759460 total: 54.4ms remaining: 13.6s 2: learn: 0.6681809 total: 56.6ms remaining: 9.38s 3: learn: 0.6605169 total: 58.6ms remaining: 7.27s 4: learn: 0.6532148 total: 60.6ms remaining: 6s 5: learn: 0.6457743 total: 62.7ms remaining: 5.16s 6: learn: 0.6374344 total: 64.8ms remaining: 4.56s 7: learn: 0.6280218 total: 66.9ms remaining: 4.11s 8: learn: 0.6199500 total: 69.1ms remaining: 3.77s 9: learn: 0.6115360 total: 71.2ms remaining: 3.49s 10: learn: 0.6058233 total: 73.3ms remaining: 3.26s 11: learn: 0.5972397 total: 75.4ms remaining: 3.06s 12: learn: 0.5893468 total: 77.4ms remaining: 2.9s 13: learn: 0.5827513 total: 79.3ms remaining: 2.75s 14: learn: 0.5732835 total: 81.3ms remaining: 2.63s 15: learn: 0.5660295 total: 83.3ms remaining: 2.52s 16: learn: 0.5597356 total: 85.3ms remaining: 2.42s 17: learn: 0.5534898 total: 87.3ms remaining: 2.34s 18: learn: 0.5459730 total: 89.3ms remaining: 2.26s 19: learn: 0.5379904 total: 91.3ms remaining: 2.19s 20: learn: 0.5325712 total: 93.3ms remaining: 2.13s 21: learn: 0.5269033 total: 95.3ms remaining: 2.07s 22: learn: 0.5201728 total: 97.3ms remaining: 2.02s 23: learn: 0.5118725 total: 99.3ms remaining: 1.97s 24: learn: 0.5057485 total: 101ms remaining: 1.92s 25: learn: 0.5000583 total: 103ms remaining: 1.88s 26: learn: 0.4951915 total: 105ms remaining: 1.84s 27: learn: 0.4909327 total: 107ms remaining: 1.81s 28: learn: 0.4842253 total: 109ms remaining: 1.77s 29: learn: 0.4787442 total: 111ms remaining: 1.74s 30: learn: 0.4723441 total: 113ms remaining: 1.71s 31: learn: 0.4655272 total: 115ms remaining: 1.69s 32: learn: 0.4613936 total: 117ms remaining: 1.66s 33: learn: 0.4551019 total: 119ms remaining: 1.64s 34: learn: 0.4490847 total: 121ms remaining: 1.61s 35: learn: 0.4438207 total: 123ms remaining: 1.59s 36: learn: 0.4400339 total: 125ms remaining: 1.57s 37: learn: 0.4349726 total: 127ms remaining: 1.55s 38: learn: 0.4311170 total: 129ms remaining: 1.53s 39: learn: 0.4265656 total: 132ms remaining: 1.51s 40: learn: 0.4223958 total: 134ms remaining: 1.5s 41: learn: 0.4177472 total: 136ms remaining: 1.48s 42: learn: 0.4135435 total: 138ms remaining: 1.46s 43: learn: 0.4082540 total: 140ms remaining: 1.45s 44: learn: 0.4047728 total: 142ms remaining: 1.43s 45: learn: 0.4005827 total: 144ms remaining: 1.42s 46: learn: 0.3966837 total: 146ms remaining: 1.41s 47: learn: 0.3928355 total: 148ms remaining: 1.4s 48: learn: 0.3892496 total: 150ms remaining: 1.38s 49: learn: 0.3851300 total: 153ms remaining: 1.37s 50: learn: 0.3804993 total: 155ms remaining: 1.36s 51: learn: 0.3766210 total: 157ms remaining: 1.35s 52: learn: 0.3720525 total: 159ms remaining: 1.34s 53: learn: 0.3687800 total: 161ms remaining: 1.33s 54: learn: 0.3656090 total: 163ms remaining: 1.32s 55: learn: 0.3623371 total: 165ms remaining: 1.31s 56: learn: 0.3587285 total: 167ms remaining: 1.3s 57: learn: 0.3559665 total: 169ms remaining: 1.29s 58: learn: 0.3525551 total: 172ms remaining: 1.28s 59: learn: 0.3502077 total: 174ms remaining: 1.27s 60: learn: 0.3461714 total: 175ms remaining: 1.26s 61: learn: 0.3431533 total: 178ms remaining: 1.25s 62: learn: 0.3387627 total: 180ms remaining: 1.25s 63: learn: 0.3347702 total: 181ms remaining: 1.24s 64: learn: 0.3312209 total: 184ms remaining: 1.23s 65: learn: 0.3283847 total: 185ms remaining: 1.22s 66: learn: 0.3250618 total: 188ms remaining: 1.21s 67: learn: 0.3214170 total: 189ms remaining: 1.2s 68: learn: 0.3192749 total: 191ms remaining: 1.2s 69: learn: 0.3161495 total: 194ms remaining: 1.19s 70: learn: 0.3131933 total: 196ms remaining: 1.18s 71: learn: 0.3107692 total: 198ms remaining: 1.18s 72: learn: 0.3077083 total: 200ms remaining: 1.17s 73: learn: 0.3037429 total: 202ms remaining: 1.16s 74: learn: 0.3020042 total: 204ms remaining: 1.15s 75: learn: 0.2988086 total: 206ms remaining: 1.15s 76: learn: 0.2959006 total: 208ms remaining: 1.14s 77: learn: 0.2932654 total: 210ms remaining: 1.13s 78: learn: 0.2907278 total: 212ms remaining: 1.13s 79: learn: 0.2884350 total: 214ms remaining: 1.12s 80: learn: 0.2863270 total: 216ms remaining: 1.12s 81: learn: 0.2835952 total: 218ms remaining: 1.11s 82: learn: 0.2805909 total: 220ms remaining: 1.1s 83: learn: 0.2781724 total: 222ms remaining: 1.1s 84: learn: 0.2757498 total: 224ms remaining: 1.09s 85: learn: 0.2740257 total: 226ms remaining: 1.09s 86: learn: 0.2714518 total: 228ms remaining: 1.08s 87: learn: 0.2693197 total: 230ms remaining: 1.08s 88: learn: 0.2670931 total: 232ms remaining: 1.07s 89: learn: 0.2649242 total: 234ms remaining: 1.07s 90: learn: 0.2620552 total: 236ms remaining: 1.06s 91: learn: 0.2605563 total: 238ms remaining: 1.06s 92: learn: 0.2582206 total: 240ms remaining: 1.05s 93: learn: 0.2567592 total: 242ms remaining: 1.05s 94: learn: 0.2546125 total: 244ms remaining: 1.04s 95: learn: 0.2527879 total: 247ms remaining: 1.04s 96: learn: 0.2503207 total: 250ms remaining: 1.04s 97: learn: 0.2474673 total: 252ms remaining: 1.03s 98: learn: 0.2453658 total: 254ms remaining: 1.03s 99: learn: 0.2432069 total: 256ms remaining: 1.02s 100: learn: 0.2409437 total: 258ms remaining: 1.02s 101: learn: 0.2393565 total: 260ms remaining: 1.01s 102: learn: 0.2369362 total: 262ms remaining: 1.01s 103: learn: 0.2353270 total: 264ms remaining: 1s 104: learn: 0.2337941 total: 266ms remaining: 1s 105: learn: 0.2318099 total: 268ms remaining: 996ms 106: learn: 0.2298857 total: 270ms remaining: 992ms 107: learn: 0.2282663 total: 272ms remaining: 988ms 108: learn: 0.2264022 total: 274ms remaining: 984ms 109: learn: 0.2247401 total: 276ms remaining: 979ms 110: learn: 0.2229827 total: 278ms remaining: 975ms 111: learn: 0.2212838 total: 280ms remaining: 970ms 112: learn: 0.2198683 total: 282ms remaining: 966ms 113: learn: 0.2181729 total: 284ms remaining: 962ms 114: learn: 0.2162552 total: 286ms remaining: 958ms 115: learn: 0.2144204 total: 288ms remaining: 954ms 116: learn: 0.2127768 total: 290ms remaining: 950ms 117: learn: 0.2113537 total: 292ms remaining: 946ms 118: learn: 0.2099306 total: 294ms remaining: 943ms 119: learn: 0.2090040 total: 296ms remaining: 939ms 120: learn: 0.2073799 total: 298ms remaining: 935ms 121: learn: 0.2059616 total: 300ms remaining: 931ms 122: learn: 0.2044892 total: 302ms remaining: 927ms 123: learn: 0.2027767 total: 304ms remaining: 923ms 124: learn: 0.2017581 total: 306ms remaining: 919ms 125: learn: 0.2002303 total: 308ms remaining: 916ms 126: learn: 0.1982704 total: 311ms remaining: 912ms 127: learn: 0.1968538 total: 312ms remaining: 908ms 128: learn: 0.1947914 total: 315ms remaining: 905ms 129: learn: 0.1934204 total: 317ms remaining: 901ms 130: learn: 0.1919906 total: 319ms remaining: 898ms 131: learn: 0.1906720 total: 321ms remaining: 894ms 132: learn: 0.1893568 total: 323ms remaining: 891ms 133: learn: 0.1876434 total: 325ms remaining: 887ms 134: learn: 0.1864395 total: 327ms remaining: 884ms 135: learn: 0.1852495 total: 329ms remaining: 880ms 136: learn: 0.1840756 total: 331ms remaining: 876ms 137: learn: 0.1826665 total: 333ms remaining: 873ms 138: learn: 0.1810117 total: 335ms remaining: 870ms 139: learn: 0.1800771 total: 337ms remaining: 866ms 140: learn: 0.1789984 total: 339ms remaining: 863ms 141: learn: 0.1774704 total: 341ms remaining: 859ms 142: learn: 0.1761096 total: 343ms remaining: 856ms 143: learn: 0.1743690 total: 345ms remaining: 853ms 144: learn: 0.1732425 total: 347ms remaining: 850ms 145: learn: 0.1723546 total: 349ms remaining: 847ms 146: learn: 0.1710597 total: 351ms remaining: 843ms 147: learn: 0.1700890 total: 353ms remaining: 840ms 148: learn: 0.1689187 total: 355ms remaining: 837ms 149: learn: 0.1675242 total: 357ms remaining: 834ms 150: learn: 0.1663232 total: 359ms remaining: 830ms 151: learn: 0.1650952 total: 361ms remaining: 827ms 152: learn: 0.1638753 total: 363ms remaining: 824ms 153: learn: 0.1625407 total: 365ms remaining: 821ms 154: learn: 0.1611600 total: 367ms remaining: 817ms 155: learn: 0.1604441 total: 369ms remaining: 814ms 156: learn: 0.1594782 total: 371ms remaining: 811ms 157: learn: 0.1581012 total: 373ms remaining: 808ms 158: learn: 0.1568787 total: 375ms remaining: 805ms 159: learn: 0.1557659 total: 377ms remaining: 802ms 160: learn: 0.1550170 total: 379ms remaining: 799ms 161: learn: 0.1537192 total: 382ms remaining: 796ms 162: learn: 0.1530482 total: 384ms remaining: 793ms 163: learn: 0.1521747 total: 386ms remaining: 790ms 164: learn: 0.1512109 total: 388ms remaining: 787ms 165: learn: 0.1502230 total: 390ms remaining: 784ms 166: learn: 0.1492235 total: 392ms remaining: 781ms 167: learn: 0.1483271 total: 394ms remaining: 778ms 168: learn: 0.1472658 total: 396ms remaining: 775ms 169: learn: 0.1463962 total: 398ms remaining: 772ms 170: learn: 0.1452737 total: 400ms remaining: 769ms 171: learn: 0.1445030 total: 402ms remaining: 766ms 172: learn: 0.1436421 total: 404ms remaining: 763ms 173: learn: 0.1426043 total: 406ms remaining: 760ms 174: learn: 0.1417990 total: 408ms remaining: 757ms 175: learn: 0.1409964 total: 410ms remaining: 754ms 176: learn: 0.1400960 total: 412ms remaining: 751ms 177: learn: 0.1392026 total: 414ms remaining: 748ms 178: learn: 0.1382591 total: 416ms remaining: 745ms 179: learn: 0.1375002 total: 418ms remaining: 743ms 180: learn: 0.1365168 total: 420ms remaining: 740ms 181: learn: 0.1355343 total: 422ms remaining: 737ms 182: learn: 0.1346431 total: 424ms remaining: 734ms 183: learn: 0.1339091 total: 426ms remaining: 731ms 184: learn: 0.1330647 total: 428ms remaining: 729ms 185: learn: 0.1323216 total: 430ms remaining: 726ms 186: learn: 0.1315324 total: 432ms remaining: 723ms 187: learn: 0.1307566 total: 434ms remaining: 720ms 188: learn: 0.1299676 total: 436ms remaining: 717ms 189: learn: 0.1292623 total: 438ms remaining: 715ms 190: learn: 0.1284551 total: 440ms remaining: 712ms 191: learn: 0.1276340 total: 442ms remaining: 709ms 192: learn: 0.1271059 total: 444ms remaining: 707ms 193: learn: 0.1264945 total: 447ms remaining: 705ms 194: learn: 0.1258445 total: 449ms remaining: 702ms 195: learn: 0.1249638 total: 451ms remaining: 699ms 196: learn: 0.1241766 total: 453ms remaining: 697ms 197: learn: 0.1235573 total: 455ms remaining: 694ms 198: learn: 0.1229102 total: 457ms remaining: 691ms 199: learn: 0.1223410 total: 459ms remaining: 688ms 200: learn: 0.1216837 total: 461ms remaining: 686ms 201: learn: 0.1210594 total: 463ms remaining: 683ms 202: learn: 0.1205716 total: 465ms remaining: 681ms 203: learn: 0.1201110 total: 467ms remaining: 678ms 204: learn: 0.1195497 total: 469ms remaining: 675ms 205: learn: 0.1189083 total: 471ms remaining: 673ms 206: learn: 0.1182833 total: 473ms remaining: 670ms 207: learn: 0.1174048 total: 475ms remaining: 667ms 208: learn: 0.1167230 total: 477ms remaining: 665ms 209: learn: 0.1161288 total: 480ms remaining: 662ms 210: learn: 0.1154258 total: 482ms remaining: 660ms 211: learn: 0.1148013 total: 484ms remaining: 657ms 212: learn: 0.1143376 total: 485ms remaining: 654ms 213: learn: 0.1134639 total: 487ms remaining: 651ms 214: learn: 0.1127821 total: 490ms remaining: 649ms 215: learn: 0.1122507 total: 492ms remaining: 646ms 216: learn: 0.1113537 total: 494ms remaining: 644ms 217: learn: 0.1106922 total: 496ms remaining: 641ms 218: learn: 0.1100270 total: 498ms remaining: 639ms 219: learn: 0.1095422 total: 500ms remaining: 636ms 220: learn: 0.1090378 total: 502ms remaining: 634ms 221: learn: 0.1083041 total: 504ms remaining: 631ms 222: learn: 0.1079058 total: 506ms remaining: 628ms 223: learn: 0.1074226 total: 508ms remaining: 626ms 224: learn: 0.1065478 total: 510ms remaining: 623ms 225: learn: 0.1060412 total: 512ms remaining: 620ms 226: learn: 0.1055783 total: 514ms remaining: 618ms 227: learn: 0.1048917 total: 516ms remaining: 616ms 228: learn: 0.1042313 total: 518ms remaining: 613ms 229: learn: 0.1036835 total: 520ms remaining: 611ms 230: learn: 0.1031781 total: 522ms remaining: 608ms 231: learn: 0.1026748 total: 524ms remaining: 606ms 232: learn: 0.1021898 total: 526ms remaining: 603ms 233: learn: 0.1018561 total: 528ms remaining: 600ms 234: learn: 0.1014298 total: 530ms remaining: 598ms 235: learn: 0.1008389 total: 532ms remaining: 595ms 236: learn: 0.1002588 total: 534ms remaining: 593ms 237: learn: 0.0998770 total: 536ms remaining: 590ms 238: learn: 0.0992447 total: 538ms remaining: 588ms 239: learn: 0.0985791 total: 540ms remaining: 585ms 240: learn: 0.0980068 total: 542ms remaining: 583ms 241: learn: 0.0975926 total: 544ms remaining: 580ms 242: learn: 0.0971103 total: 546ms remaining: 578ms 243: learn: 0.0965935 total: 548ms remaining: 575ms 244: learn: 0.0959998 total: 550ms remaining: 573ms 245: learn: 0.0953505 total: 552ms remaining: 570ms 246: learn: 0.0949572 total: 554ms remaining: 568ms 247: learn: 0.0944275 total: 556ms remaining: 565ms 248: learn: 0.0939614 total: 558ms remaining: 563ms 249: learn: 0.0934980 total: 560ms remaining: 560ms 250: learn: 0.0929929 total: 562ms remaining: 558ms 251: learn: 0.0926004 total: 564ms remaining: 555ms 252: learn: 0.0922454 total: 566ms remaining: 553ms 253: learn: 0.0916853 total: 568ms remaining: 550ms 254: learn: 0.0913217 total: 571ms remaining: 548ms 255: learn: 0.0909009 total: 573ms remaining: 546ms 256: learn: 0.0904650 total: 575ms remaining: 543ms 257: learn: 0.0900383 total: 577ms remaining: 541ms 258: learn: 0.0895855 total: 579ms remaining: 538ms 259: learn: 0.0892367 total: 581ms remaining: 536ms 260: learn: 0.0889366 total: 583ms remaining: 534ms 261: learn: 0.0886641 total: 585ms remaining: 531ms 262: learn: 0.0882293 total: 587ms remaining: 529ms 263: learn: 0.0877998 total: 589ms remaining: 526ms 264: learn: 0.0872884 total: 591ms remaining: 524ms 265: learn: 0.0867746 total: 593ms remaining: 521ms 266: learn: 0.0862749 total: 595ms remaining: 519ms 267: learn: 0.0859732 total: 597ms remaining: 517ms 268: learn: 0.0854699 total: 599ms remaining: 514ms 269: learn: 0.0851456 total: 601ms remaining: 512ms 270: learn: 0.0848958 total: 603ms remaining: 509ms 271: learn: 0.0845757 total: 605ms remaining: 507ms 272: learn: 0.0841830 total: 607ms remaining: 505ms 273: learn: 0.0836906 total: 609ms remaining: 502ms 274: learn: 0.0833261 total: 611ms remaining: 500ms 275: learn: 0.0827048 total: 613ms remaining: 498ms 276: learn: 0.0823247 total: 615ms remaining: 495ms 277: learn: 0.0819396 total: 617ms remaining: 493ms 278: learn: 0.0816164 total: 619ms remaining: 490ms 279: learn: 0.0812882 total: 621ms remaining: 488ms 280: learn: 0.0809456 total: 623ms remaining: 486ms 281: learn: 0.0805355 total: 625ms remaining: 483ms 282: learn: 0.0800955 total: 627ms remaining: 481ms 283: learn: 0.0796182 total: 630ms remaining: 479ms 284: learn: 0.0793585 total: 632ms remaining: 476ms 285: learn: 0.0790722 total: 634ms remaining: 474ms 286: learn: 0.0786436 total: 636ms remaining: 472ms 287: learn: 0.0782855 total: 638ms remaining: 470ms 288: learn: 0.0777196 total: 640ms remaining: 468ms 289: learn: 0.0774455 total: 643ms remaining: 465ms 290: learn: 0.0771741 total: 645ms remaining: 463ms 291: learn: 0.0767213 total: 647ms remaining: 461ms 292: learn: 0.0763101 total: 649ms remaining: 458ms 293: learn: 0.0759696 total: 651ms remaining: 456ms 294: learn: 0.0756114 total: 653ms remaining: 454ms 295: learn: 0.0752771 total: 655ms remaining: 451ms 296: learn: 0.0749658 total: 657ms remaining: 449ms 297: learn: 0.0745448 total: 659ms remaining: 447ms 298: learn: 0.0742472 total: 661ms remaining: 444ms 299: learn: 0.0739642 total: 663ms remaining: 442ms 300: learn: 0.0736538 total: 665ms remaining: 440ms 301: learn: 0.0731854 total: 667ms remaining: 437ms 302: learn: 0.0729014 total: 669ms remaining: 435ms 303: learn: 0.0726266 total: 671ms remaining: 432ms 304: learn: 0.0723980 total: 673ms remaining: 430ms 305: learn: 0.0720825 total: 675ms remaining: 428ms 306: learn: 0.0717842 total: 677ms remaining: 425ms 307: learn: 0.0715165 total: 679ms remaining: 423ms 308: learn: 0.0712906 total: 681ms remaining: 421ms 309: learn: 0.0709843 total: 683ms remaining: 418ms 310: learn: 0.0706904 total: 685ms remaining: 416ms 311: learn: 0.0703273 total: 687ms remaining: 414ms 312: learn: 0.0699863 total: 689ms remaining: 411ms 313: learn: 0.0697272 total: 691ms remaining: 409ms 314: learn: 0.0693374 total: 693ms remaining: 407ms 315: learn: 0.0690057 total: 695ms remaining: 405ms 316: learn: 0.0687646 total: 697ms remaining: 402ms 317: learn: 0.0684913 total: 699ms remaining: 400ms 318: learn: 0.0682175 total: 701ms remaining: 398ms 319: learn: 0.0679486 total: 703ms remaining: 395ms 320: learn: 0.0675699 total: 705ms remaining: 393ms 321: learn: 0.0673991 total: 707ms remaining: 391ms 322: learn: 0.0671463 total: 709ms remaining: 389ms 323: learn: 0.0668977 total: 711ms remaining: 386ms 324: learn: 0.0665579 total: 713ms remaining: 384ms 325: learn: 0.0663293 total: 715ms remaining: 382ms 326: learn: 0.0660506 total: 718ms remaining: 380ms 327: learn: 0.0658147 total: 720ms remaining: 377ms 328: learn: 0.0655132 total: 722ms remaining: 375ms 329: learn: 0.0652340 total: 724ms remaining: 373ms 330: learn: 0.0649374 total: 726ms remaining: 371ms 331: learn: 0.0647005 total: 728ms remaining: 368ms 332: learn: 0.0644140 total: 730ms remaining: 366ms 333: learn: 0.0641768 total: 733ms remaining: 364ms 334: learn: 0.0638923 total: 735ms remaining: 362ms 335: learn: 0.0635807 total: 737ms remaining: 360ms 336: learn: 0.0633689 total: 739ms remaining: 357ms 337: learn: 0.0630989 total: 741ms remaining: 355ms 338: learn: 0.0628456 total: 743ms remaining: 353ms 339: learn: 0.0625842 total: 745ms remaining: 351ms 340: learn: 0.0622742 total: 747ms remaining: 348ms 341: learn: 0.0620714 total: 749ms remaining: 346ms 342: learn: 0.0617471 total: 751ms remaining: 344ms 343: learn: 0.0615121 total: 753ms remaining: 341ms 344: learn: 0.0612585 total: 755ms remaining: 339ms 345: learn: 0.0610550 total: 757ms remaining: 337ms 346: learn: 0.0608039 total: 759ms remaining: 335ms 347: learn: 0.0604924 total: 761ms remaining: 332ms 348: learn: 0.0601795 total: 763ms remaining: 330ms 349: learn: 0.0598725 total: 765ms remaining: 328ms 350: learn: 0.0596511 total: 767ms remaining: 326ms 351: learn: 0.0594457 total: 769ms remaining: 323ms 352: learn: 0.0592154 total: 771ms remaining: 321ms 353: learn: 0.0589629 total: 773ms remaining: 319ms 354: learn: 0.0586987 total: 775ms remaining: 317ms 355: learn: 0.0584759 total: 777ms remaining: 314ms 356: learn: 0.0582162 total: 779ms remaining: 312ms 357: learn: 0.0579987 total: 781ms remaining: 310ms 358: learn: 0.0576667 total: 783ms remaining: 308ms 359: learn: 0.0574809 total: 785ms remaining: 305ms 360: learn: 0.0573083 total: 787ms remaining: 303ms 361: learn: 0.0571192 total: 789ms remaining: 301ms 362: learn: 0.0569243 total: 791ms remaining: 299ms 363: learn: 0.0567261 total: 793ms remaining: 296ms 364: learn: 0.0564865 total: 795ms remaining: 294ms 365: learn: 0.0562840 total: 797ms remaining: 292ms 366: learn: 0.0560785 total: 799ms remaining: 290ms 367: learn: 0.0558322 total: 802ms remaining: 288ms 368: learn: 0.0556182 total: 804ms remaining: 285ms 369: learn: 0.0554483 total: 806ms remaining: 283ms 370: learn: 0.0553145 total: 808ms remaining: 281ms 371: learn: 0.0551290 total: 810ms remaining: 279ms 372: learn: 0.0548128 total: 812ms remaining: 276ms 373: learn: 0.0546286 total: 814ms remaining: 274ms 374: learn: 0.0544126 total: 816ms remaining: 272ms 375: learn: 0.0541702 total: 818ms remaining: 270ms 376: learn: 0.0540095 total: 820ms remaining: 268ms 377: learn: 0.0537582 total: 822ms remaining: 265ms 378: learn: 0.0535366 total: 825ms remaining: 263ms 379: learn: 0.0532767 total: 827ms remaining: 261ms 380: learn: 0.0530737 total: 830ms remaining: 259ms 381: learn: 0.0529433 total: 832ms remaining: 257ms 382: learn: 0.0526476 total: 835ms remaining: 255ms 383: learn: 0.0524374 total: 837ms remaining: 253ms 384: learn: 0.0522682 total: 839ms remaining: 251ms 385: learn: 0.0521002 total: 842ms remaining: 249ms 386: learn: 0.0517807 total: 845ms remaining: 247ms 387: learn: 0.0516491 total: 847ms remaining: 244ms 388: learn: 0.0513977 total: 849ms remaining: 242ms 389: learn: 0.0511981 total: 851ms remaining: 240ms 390: learn: 0.0509928 total: 853ms remaining: 238ms 391: learn: 0.0508100 total: 855ms remaining: 235ms 392: learn: 0.0506515 total: 857ms remaining: 233ms 393: learn: 0.0504879 total: 859ms remaining: 231ms 394: learn: 0.0503601 total: 861ms remaining: 229ms 395: learn: 0.0501811 total: 863ms remaining: 227ms 396: learn: 0.0499619 total: 865ms remaining: 224ms 397: learn: 0.0497344 total: 867ms remaining: 222ms 398: learn: 0.0494894 total: 869ms remaining: 220ms 399: learn: 0.0492946 total: 871ms remaining: 218ms 400: learn: 0.0491168 total: 873ms remaining: 215ms 401: learn: 0.0489381 total: 875ms remaining: 213ms 402: learn: 0.0487662 total: 877ms remaining: 211ms 403: learn: 0.0485960 total: 879ms remaining: 209ms 404: learn: 0.0484260 total: 881ms remaining: 207ms 405: learn: 0.0482357 total: 883ms remaining: 204ms 406: learn: 0.0480300 total: 885ms remaining: 202ms 407: learn: 0.0478388 total: 887ms remaining: 200ms 408: learn: 0.0477056 total: 889ms remaining: 198ms 409: learn: 0.0475431 total: 891ms remaining: 196ms 410: learn: 0.0473370 total: 893ms remaining: 193ms 411: learn: 0.0471485 total: 895ms remaining: 191ms 412: learn: 0.0470046 total: 897ms remaining: 189ms 413: learn: 0.0467841 total: 899ms remaining: 187ms 414: learn: 0.0466280 total: 901ms remaining: 185ms 415: learn: 0.0464732 total: 903ms remaining: 182ms 416: learn: 0.0462847 total: 905ms remaining: 180ms 417: learn: 0.0461002 total: 907ms remaining: 178ms 418: learn: 0.0459447 total: 909ms remaining: 176ms 419: learn: 0.0458015 total: 911ms remaining: 174ms 420: learn: 0.0456173 total: 913ms remaining: 171ms 421: learn: 0.0454063 total: 915ms remaining: 169ms 422: learn: 0.0452413 total: 917ms remaining: 167ms 423: learn: 0.0451065 total: 919ms remaining: 165ms 424: learn: 0.0449328 total: 921ms remaining: 163ms 425: learn: 0.0447209 total: 923ms remaining: 160ms 426: learn: 0.0445578 total: 925ms remaining: 158ms 427: learn: 0.0443970 total: 927ms remaining: 156ms 428: learn: 0.0441543 total: 929ms remaining: 154ms 429: learn: 0.0440567 total: 931ms remaining: 152ms 430: learn: 0.0438545 total: 933ms remaining: 149ms 431: learn: 0.0437481 total: 935ms remaining: 147ms 432: learn: 0.0435939 total: 937ms remaining: 145ms 433: learn: 0.0434315 total: 939ms remaining: 143ms 434: learn: 0.0432680 total: 941ms remaining: 141ms 435: learn: 0.0431179 total: 944ms remaining: 138ms 436: learn: 0.0429558 total: 946ms remaining: 136ms 437: learn: 0.0427794 total: 948ms remaining: 134ms 438: learn: 0.0426621 total: 950ms remaining: 132ms 439: learn: 0.0424642 total: 952ms remaining: 130ms 440: learn: 0.0422749 total: 954ms remaining: 128ms 441: learn: 0.0420968 total: 956ms remaining: 125ms 442: learn: 0.0419342 total: 958ms remaining: 123ms 443: learn: 0.0417983 total: 960ms remaining: 121ms 444: learn: 0.0416726 total: 962ms remaining: 119ms 445: learn: 0.0414811 total: 964ms remaining: 117ms 446: learn: 0.0413471 total: 966ms remaining: 115ms 447: learn: 0.0412036 total: 968ms remaining: 112ms 448: learn: 0.0410487 total: 970ms remaining: 110ms 449: learn: 0.0408936 total: 972ms remaining: 108ms 450: learn: 0.0407416 total: 974ms remaining: 106ms 451: learn: 0.0405291 total: 976ms remaining: 104ms 452: learn: 0.0404189 total: 978ms remaining: 102ms 453: learn: 0.0402506 total: 980ms remaining: 99.3ms 454: learn: 0.0401505 total: 982ms remaining: 97.1ms 455: learn: 0.0399942 total: 984ms remaining: 95ms 456: learn: 0.0398390 total: 986ms remaining: 92.8ms 457: learn: 0.0396801 total: 988ms remaining: 90.6ms 458: learn: 0.0394797 total: 990ms remaining: 88.5ms 459: learn: 0.0393357 total: 992ms remaining: 86.3ms 460: learn: 0.0392137 total: 994ms remaining: 84.1ms 461: learn: 0.0390646 total: 996ms remaining: 82ms 462: learn: 0.0389406 total: 998ms remaining: 79.8ms 463: learn: 0.0388542 total: 1s remaining: 77.6ms 464: learn: 0.0387217 total: 1s remaining: 75.4ms 465: learn: 0.0386027 total: 1s remaining: 73.3ms 466: learn: 0.0384044 total: 1.01s remaining: 71.1ms 467: learn: 0.0382599 total: 1.01s remaining: 69ms 468: learn: 0.0380873 total: 1.01s remaining: 66.8ms 469: learn: 0.0379271 total: 1.01s remaining: 64.6ms 470: learn: 0.0377469 total: 1.01s remaining: 62.5ms 471: learn: 0.0376363 total: 1.02s remaining: 60.3ms 472: learn: 0.0374950 total: 1.02s remaining: 58.1ms 473: learn: 0.0373768 total: 1.02s remaining: 56ms 474: learn: 0.0372549 total: 1.02s remaining: 53.8ms 475: learn: 0.0370989 total: 1.02s remaining: 51.7ms 476: learn: 0.0369066 total: 1.03s remaining: 49.5ms 477: learn: 0.0367730 total: 1.03s remaining: 47.3ms 478: learn: 0.0366318 total: 1.03s remaining: 45.2ms 479: learn: 0.0364936 total: 1.03s remaining: 43ms 480: learn: 0.0363073 total: 1.03s remaining: 40.9ms 481: learn: 0.0361262 total: 1.04s remaining: 38.7ms 482: learn: 0.0359913 total: 1.04s remaining: 36.6ms 483: learn: 0.0358801 total: 1.04s remaining: 34.4ms 484: learn: 0.0357930 total: 1.04s remaining: 32.3ms 485: learn: 0.0356674 total: 1.04s remaining: 30.1ms 486: learn: 0.0355692 total: 1.05s remaining: 28ms 487: learn: 0.0354164 total: 1.05s remaining: 25.8ms 488: learn: 0.0352671 total: 1.05s remaining: 23.7ms 489: learn: 0.0351130 total: 1.05s remaining: 21.5ms 490: learn: 0.0349866 total: 1.05s remaining: 19.3ms 491: learn: 0.0348900 total: 1.06s remaining: 17.2ms 492: learn: 0.0347980 total: 1.06s remaining: 15ms 493: learn: 0.0347149 total: 1.06s remaining: 12.9ms 494: learn: 0.0345927 total: 1.06s remaining: 10.7ms 495: learn: 0.0344583 total: 1.06s remaining: 8.59ms 496: learn: 0.0343711 total: 1.07s remaining: 6.44ms 497: learn: 0.0342721 total: 1.07s remaining: 4.29ms 498: learn: 0.0341411 total: 1.07s remaining: 2.15ms 499: learn: 0.0340330 total: 1.07s remaining: 0us 模型训练集Precision:1.0,Recall:1.0,F1_Score:1.0,Accuracy:1.0 模型测试集Precision:0.9111111111111111,Recall:0.9111111111111111,F1_score:0.9111111111111111,Accuracy:0.9166666666666666 -------------------测试集混淆举证-------------------
#计算PR曲线和ROC曲线值
CatBoost_PR,CatBoost_Accuracy=PR_Curve(Model=CatBoost,Data=MLRunData,Label=ReMLLabel)
CatBoost_ROC,CatBoost_AUC=ROC_Curve(Model=CatBoost,Data=MLRunData,Label=ReMLLabel)
#输出PR曲线和ROC曲线结果
CatBoost_PR.to_excel('/mnt/workspace/Analysis Data/ML PR Data/CatBoost_PR.xlsx','UTF-8')
CatBoost_ROC.to_excel('/mnt/workspace/Analysis Data/ML ROC Data/CatBoost_ROC.xlsx','UTF-8')
#绘制PR曲线
import matplotlib.pyplot as plt
PR_curve=plt.figure(dpi=300)
PR_ax=PR_curve.add_subplot(111)
PR_ax.set_title('Precision-Recall curve')
PR_ax.plot(CatBoost_PR['recall'], CatBoost_PR['precision'],color='red',label='CatBoost=%f'%CatBoost_Accuracy)
PR_ax.plot(DTC_PR['recall'], DTC_PR['precision'],color='green',label='DecisionTree=%f'%DTC_Accuracy)
PR_ax.plot(LightGBM_PR['recall'], LightGBM_PR['precision'],color='blue',label='LightGBM=%f'%LightGBM_Accuracy)
PR_ax.plot(Logistic_PR['recall'], Logistic_PR['precision'],color='yellow',label='Logistic=%f'%Logistic_Accuracy)
PR_ax.plot(RondomForest_PR['recall'], RondomForest_PR['precision'],color='purple',label='RondomForest=%f'%RondomForest_Accuracy)
PR_ax.plot(SVM_PR['recall'], SVM_PR['precision'],color='cyan',label='SVM=%f'%SVM_Accuracy)
PR_ax.plot(XGBoost_PR['recall'], XGBoost_PR['precision'],color='pink',label='XGBoost=%f'%XGBoost_Accuracy)
PR_ax.plot([0,1],[1,0],linestyle='-.',color='black')
PR_ax.set_xlabel('Recall')
PR_ax.set_ylabel('Precision')
plt.legend(loc="best")
plt.show()
#绘制ROC曲线
import matplotlib.pyplot as plt
ROC_curve=plt.figure(dpi=300)
ROC_ax=ROC_curve.add_subplot(111)
ROC_ax.set_title('ROC Curve')
ROC_ax.plot(CatBoost_ROC['fpr'], CatBoost_ROC['tpr'],color='red',label='CatBoost=%f'%CatBoost_AUC)
ROC_ax.plot(DTC_ROC['fpr'], DTC_ROC['tpr'],color='green',label='DecisionTree=%f'%DTC_AUC)
ROC_ax.plot(LightGBM_ROC['fpr'], LightGBM_ROC['tpr'],color='blue',label='LightGBM=%f'%LightGBM_AUC)
ROC_ax.plot(Logistic_ROC['fpr'], Logistic_ROC['tpr'],color='yellow',label='Logistic=%f'%Logistic_AUC)
ROC_ax.plot(RondomForest_ROC['fpr'], RondomForest_ROC['tpr'],color='purple',label='RondomForest=%f'%RondomForest_AUC)
ROC_ax.plot(SVM_ROC['fpr'], SVM_ROC['tpr'],color='cyan',label='SVM=%f'%SVM_AUC)
ROC_ax.plot(XGBoost_ROC['fpr'], XGBoost_ROC['tpr'],color='pink',label='XGBoost=%f'%XGBoost_AUC)
ROC_ax.plot([0,1],[0,1],linestyle='-.',color='black')
ROC_ax.set_xlim([-0.05, 1.0])
ROC_ax.set_ylim([0, 1.05])
ROC_ax.set_xlabel('FPR')
ROC_ax.set_ylabel('TPR')
plt.legend(loc="best")
plt.show()
#保存模型-输入[batchs,61],输出[batch,1]
import joblib
MLModel=[Logistic,SVM_Classifier,Best_DTC,Best_Forest,XGBoost,LightGBM,CatBoost]
MLModelStr=['Logistic.pkl','SVM.pkl','DecisionTree.pkl','Forest.pkl','XGBoost.pkl','LightGBM.pkl','CatBoost.pkl']
try:
for model,modelstr in zip(MLModel,MLModelStr):
joblib.dump(model,'/mnt/workspace/Analysis Model/Meachine Learning Models/'+modelstr)
print('模型保存成功!')
except:
print('模型保存异常!!!')
模型保存成功!
#执行PCA算法-计算保留95%信息时的维度
from sklearn.decomposition import PCA
PCA=PCA(n_components=0.95)
Methylation_95=PCA.fit_transform(MLRunData) #数据降维
Methylation_95.shape
(318, 22)
#构建SHAP库解析数据集
from sklearn.model_selection import train_test_split
TrainDataSHAP,TestDataSHAP,TrainLabelSHAP,TestLabelSHAP=train_test_split(MLRunData,ReMLLabel,train_size=0.7,random_state=2025)
#载入SHAP库计算SHAP值
import shap
SVMExplainer=shap.KernelExplainer(SVM_Classifier.predict, TestDataSHAP)
shap_values=SVMExplainer.shap_values(TestDataSHAP)
100%|██████████| 96/96 [06:58<00:00, 4.36s/it]
#绘制Summaryplot图
shap.summary_plot(shap_values,TestDataSHAP,max_display=22)
shap.summary_plot(shap_values,TestDataSHAP,max_display=22,plot_type='bar')
#绘制SHAP图
SHAP_ResFormer=(TestDataSHAP)
100%|██████████| 96/96 [07:01<00:00, 4.39s/it]
shap.plots.heatmap(SHAP_SVM,max_display=23)
<Axes: xlabel='Instances'>
#绘制Decision图
expected_value=SVMExplainer.expected_value
shap.decision_plot(expected_value,shap_values,TestDataSHAP.columns,feature_display_range=slice(None, -23, -1))
#绘制SHAP散点图
shap.plots.scatter(SHAP_SVM[:,'cg00581848'])
shap.plots.scatter(SHAP_SVM[:,'cg00581848'],color=SHAP_SVM)
#绘制SHAP散点图-性别特异性
shap.plots.scatter(SHAP_SVM[:,'GenderEncoder'])
shap.plots.scatter(SHAP_SVM[:,'GenderEncoder'],color=SHAP_SVM)
#输出SHAP值
SHAPData=DataFrame(shap_values)
SHAPData.to_excel('/mnt/workspace/Analysis Data/SVM_SHAP_Data.xlsx','UFT-8')
SHAPData.to_csv('/mnt/workspace/Analysis Data/SVM_SHAP_Data.csv')
使用LIME(Local Interpretable Model-agnostic Explanations)对SVM模型进行模型解释性。
#转化为Numpy数组
import numpy as np
TestDataLIME=np.array(TestDataSHAP)
TestDataLIME
#构建LIME解释器
from lime.lime_tabular import LimeTabularExplainer
SVM_LIMEExplainer=LimeTabularExplainer(training_data=TestDataLIME,mode='classification',feature_names=TestDataSHAP.columns,class_names=[0,1], discretize_continuous=True)
#提取类风湿性关节炎患者和健康人数据——为数据生成解释
LIMEData=pd.concat([TestDataSHAP,TestLabelSHAP],axis=1) #合并数据
Contral_Data=LIMEData.loc[LIMEData.loc[:,'DiseaseEncoder']==0,:] #正常组数据
Contral_DataSample=Contral_Data.iloc[0,:-1]
RA_Data=LIMEData.loc[LIMEData.loc[:,'DiseaseEncoder']==1,:] #正常组数据
RA_DataSample=RA_Data.iloc[0,:-1]
#为健康人生成可解释性
Contral_Explain=SVM_LIMEExplainer.explain_instance(Contral_DataSample,SVM_Classifier.predict_proba,num_features=22)
#可视化解释结果
Contral_Explain.show_in_notebook(show_table=True, show_all=False)
Contral_Explain.as_pyplot_figure()
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:544: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` binary_column = (inverse_column == first_row[column]).astype(int) /usr/local/lib/python3.11/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but SVC was fitted with feature names warnings.warn( /usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:427: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` discretized_instance[f])]
#为类风湿性关节炎患者生成可解释性
RA_Explain=SVM_LIMEExplainer.explain_instance(RA_DataSample,SVM_Classifier.predict_proba,num_features=22)
#可视化解释结果
RA_Explain.show_in_notebook(show_table=True, show_all=False)
RA_Explain.as_pyplot_figure()
/usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:544: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` binary_column = (inverse_column == first_row[column]).astype(int) /usr/local/lib/python3.11/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but SVC was fitted with feature names warnings.warn( /usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/discretize.py:110: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value` ret[feature] = int(self.lambdas[feature](ret[feature])) /usr/local/lib/python3.11/dist-packages/lime/lime_tabular.py:427: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` discretized_instance[f])]
提取SHAP方法和LIME方法提取的特征数据做交集获得候选特征。
#获取交 集数据
SHAP_Feature=['cg00581848','cg05544622','cg00478198','cg01938887','cg17482649','cg00455876','cg11704979','cg01515508',
'cg05257372','cg22221554','cg00423014','cg22561883','cg10315562','cg26012731','cg03601619','cg26039926',
'cg02714462','cg08141049','cg05443523','cg19532714','age','cg07066594'] #SHAP框架提取特征
LIME_Feature=['cg07066594','cg00581848','cg17482649','cg00455876','cg05544622','cg00423014','cg01515508','cg05443523',
'cg08141049','cg26039926','cg00342358','cg23925558','cg16078210','cg00306390','cg00776430','cg01938887',
'cg00543485','cg22561883','cg02896361','cg17488844','cg05257372','cg12944030'] #LIME提取特征
MethylationRA=set(SHAP_Feature)&set(LIME_Feature) #计算交集
MethylationRA=list(MethylationRA)+['age','gender','disease','GenderEncoder','DiseaseEncoder']
MethylationRA
['cg05544622', 'cg01938887', 'cg07066594', 'cg22561883', 'cg00581848', 'cg05443523', 'cg05257372', 'cg00455876', 'cg17482649', 'cg26039926', 'cg00423014', 'cg01515508', 'cg08141049', 'age', 'gender', 'disease', 'GenderEncoder', 'DiseaseEncoder']
对交集DNA甲基化位点数据进行统计学分析,主要包括:正态检验、差异分析和相关分析。
#获取统计学数据
StatsData=MethylationData.loc[:,MethylationRA]
StatsData.shape
(318, 18)
#首先进行正态性分析
from scipy.stats import normaltest
CGList=['cg05544622','cg01938887','cg07066594','cg22561883','cg00581848','cg05443523','cg05257372','cg00455876',
'cg17482649','cg26039926','cg00423014','cg01515508','cg08141049','age']
NormalCG,NormalStats,NormalPvalue=[],[],[]
for cg in CGList:
CGData=StatsData.loc[:,cg] #提取数据
normalstats,normalpvalue=normaltest(CGData) #正态检验
NormalCG.append(cg)
NormalStats.append(normalstats)
NormalPvalue.append(normalpvalue)
NormalResult=DataFrame()
NormalResult['Feature']=NormalCG
NormalResult['Normal Stats']=NormalStats
NormalResult['Normal P value']=NormalPvalue
NormalResult=NormalResult.sort_values('Normal P value',ascending=True)
NormalResult
| Feature | Normal Stats | Normal P value | |
|---|---|---|---|
| 3 | cg22561883 | 45397.441686 | 0.000000e+00 |
| 0 | cg05544622 | 748.223251 | 3.352629e-163 |
| 12 | cg08141049 | 388.753626 | 3.830210e-85 |
| 8 | cg17482649 | 234.045003 | 1.505821e-51 |
| 9 | cg26039926 | 230.466573 | 9.011967e-51 |
| 7 | cg00455876 | 204.595818 | 3.737512e-45 |
| 5 | cg05443523 | 169.865790 | 1.300506e-37 |
| 11 | cg01515508 | 153.367553 | 4.973465e-34 |
| 10 | cg00423014 | 148.633679 | 5.304039e-33 |
| 2 | cg07066594 | 122.553661 | 2.442366e-27 |
| 6 | cg05257372 | 87.312124 | 1.097522e-19 |
| 4 | cg00581848 | 76.637329 | 2.282525e-17 |
| 1 | cg01938887 | 8.991284 | 1.115751e-02 |
| 13 | age | 4.452384 | 1.079387e-01 |
#输出正态检验结果
NormalResult.to_excel('/mnt/workspace/Analysis Data/Stats Results/NormalResult.xlsx','UTF-8')
NormalResult.to_csv('/mnt/workspace/Analysis Data/Stats Results/NormalResult.csv')
#查看数据分布
import seaborn as sns
for cg in CGList:
kde=plt.figure()
sns.distplot(StatsData.loc[:,cg])
plt.show()
#以是否发生类风湿性关节炎分组进行差异分析
#使用Mann-Whitney U和Kruskal-Wallis H进行差异分析
from scipy.stats import mannwhitneyu,kruskal
Diff_name,Mann_W_stats,Mann_W_pvalue=[],[],[]
KW_Stats,KW_Pvalue=[],[]
RA=StatsData.loc[StatsData.loc[:,'disease']=='rheumatoid arthritis',:]
Control=StatsData.loc[StatsData.loc[:,'disease']=='control',:]
for name in CGList:
mann_stats,mann_pvalue=mannwhitneyu(RA.loc[:,name],Control.loc[:,name],alternative='two-sided')
Diff_name.append(name)
Mann_W_stats.append(mann_stats)
Mann_W_pvalue.append(mann_pvalue)
kw_stats,kw_pvalue=kruskal(RA.loc[:,name],Control.loc[:,name])
KW_Stats.append(kw_stats)
KW_Pvalue.append(kw_pvalue)
DiffResult=DataFrame()
DiffResult['CG']=Diff_name
DiffResult['Mann-Whitney U Stats']=Mann_W_stats
DiffResult['Mann-Whitney U Pvalue']=Mann_W_pvalue
DiffResult['Kruskal-Wallis H Stat']=KW_Stats
DiffResult['Kruskal-Wallis H Pvalue']=KW_Pvalue
DiffResult=DiffResult.sort_values('Mann-Whitney U Pvalue',ascending=True)
DiffResult
| CG | Mann-Whitney U Stats | Mann-Whitney U Pvalue | Kruskal-Wallis H Stat | Kruskal-Wallis H Pvalue | |
|---|---|---|---|---|---|
| 12 | cg08141049 | 18348.5 | 1.423253e-18 | 77.375583 | 1.413519e-18 |
| 9 | cg26039926 | 5909.5 | 2.189873e-16 | 67.433837 | 2.178774e-16 |
| 6 | cg05257372 | 6207.5 | 3.556541e-15 | 61.940912 | 3.539194e-15 |
| 7 | cg00455876 | 8289.5 | 2.437285e-14 | 58.156458 | 2.420782e-14 |
| 4 | cg00581848 | 16820.0 | 6.371911e-13 | 51.741242 | 6.331863e-13 |
| 8 | cg17482649 | 9202.0 | 1.038787e-09 | 37.261518 | 1.033031e-09 |
| 10 | cg00423014 | 7932.5 | 2.694328e-09 | 35.400669 | 2.683947e-09 |
| 11 | cg01515508 | 17054.5 | 7.118968e-08 | 29.038656 | 7.094835e-08 |
| 5 | cg05443523 | 15647.0 | 4.901541e-07 | 25.310599 | 4.880197e-07 |
| 2 | cg07066594 | 16273.5 | 3.926378e-06 | 21.306322 | 3.914385e-06 |
| 0 | cg05544622 | 9963.0 | 1.647483e-04 | 14.200984 | 1.642846e-04 |
| 1 | cg01938887 | 11455.0 | 1.482944e-01 | 2.091472 | 1.481233e-01 |
| 3 | cg22561883 | 11780.5 | 2.944031e-01 | 1.100655 | 2.941223e-01 |
| 13 | age | 13486.5 | 3.022940e-01 | 1.065306 | 3.020081e-01 |
#输出差异分析结果
DiffResult.to_excel('/mnt/workspace/Analysis Data/Stats Results/DiffResult.xlsx','UTF-8')
DiffResult.to_csv('/mnt/workspace/Analysis Data/Stats Results/DiffResult.csv')
StatsData
| cg05544622 | cg01938887 | cg07066594 | cg22561883 | cg00581848 | cg05443523 | cg05257372 | cg00455876 | cg17482649 | cg26039926 | cg00423014 | cg01515508 | cg08141049 | age | gender | disease | GenderEncoder | DiseaseEncoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | -1.271175 | -1.348848 | -0.176421 | -2.616463 | -2.441207 | 3.938986 | 0.000000 | 0.000000 | -2.749976 | 1.202370 | 3.701353 | -0.866223 | 19.0000 | F | control | 1.0 | 0 |
| 1 | 0.000000 | -1.130596 | -1.495729 | 0.347383 | -2.468664 | -1.536363 | 3.938986 | 0.000000 | 0.900040 | -3.052934 | 1.019899 | 3.543689 | -1.265340 | 12.0000 | F | control | 1.0 | 0 |
| 2 | 0.000000 | -1.168907 | -0.553611 | -0.196592 | 0.000000 | -1.071863 | 3.580953 | 0.000000 | 0.856647 | -3.228904 | 0.241113 | 1.557086 | -0.575243 | 71.8192 | F | control | 1.0 | 0 |
| 3 | 4.178048 | -2.921730 | -3.201956 | -3.701353 | -2.732410 | 0.000000 | 3.472874 | 0.510719 | 0.000000 | -2.664729 | 1.839820 | 4.178048 | 0.000000 | 23.0000 | M | control | 0.0 | 0 |
| 4 | 0.000000 | -1.456115 | -2.131436 | 0.011998 | 0.000000 | 0.000000 | 3.938986 | 0.000000 | 0.372163 | -2.131436 | 0.837604 | 3.938986 | -1.635580 | 84.0000 | F | control | 1.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 313 | 0.000000 | -1.798190 | -1.236447 | -0.019997 | 0.000000 | 0.000000 | 3.580953 | 0.000000 | 0.000000 | -2.963760 | 0.000000 | 4.402578 | 0.000000 | 34.0000 | F | rheumatoid arthritis | 1.0 | 1 |
| 314 | 3.659672 | -4.112908 | 0.000000 | -4.247583 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -2.253116 | 0.000000 | 3.993781 | 0.000000 | 51.0000 | M | rheumatoid arthritis | 0.0 | 1 |
| 315 | 0.000000 | -2.163468 | 0.000000 | -0.180452 | 0.000000 | 0.000000 | 3.343970 | 0.000000 | 0.000000 | -3.343970 | 1.130596 | 3.472874 | 0.000000 | 45.0000 | F | rheumatoid arthritis | 1.0 | 1 |
| 316 | 0.000000 | -1.578093 | 0.000000 | -0.088039 | 0.000000 | 0.000000 | 3.256540 | 0.000000 | 0.000000 | -2.767818 | 0.000000 | 4.178048 | 0.000000 | 31.0000 | F | rheumatoid arthritis | 1.0 | 1 |
| 317 | 0.000000 | -1.141464 | -0.895180 | -0.224898 | 0.000000 | 0.000000 | 3.149987 | 0.000000 | 0.000000 | -3.076403 | 1.672671 | 4.940737 | 0.000000 | 58.0000 | F | rheumatoid arthritis | 1.0 | 1 |
318 rows × 18 columns
#绘制特异性数据箱型图
for names in CGList:
plt.figure(dpi=300)
sns.boxplot(StatsData,x='disease',y=names,showmeans=True)
plt.show()
#以是类风湿性关节炎性别分组进行差异分析
#使用Mann-Whitney U和Kruskal-Wallis H进行差异分析
from scipy.stats import mannwhitneyu,kruskal
SexDiff_name,SexMann_W_stats,SexMann_W_pvalue=[],[],[]
SexKW_Stats,SexKW_Pvalue=[],[]
Man=RA.loc[RA.loc[:,'gender']=='M',:]
Feman=RA.loc[RA.loc[:,'gender']=='F',:]
for name in CGList:
mann_stats,mann_pvalue=mannwhitneyu(Man.loc[:,name],Feman.loc[:,name],alternative='two-sided')
SexDiff_name.append(name)
SexMann_W_stats.append(mann_stats)
SexMann_W_pvalue.append(mann_pvalue)
kw_stats,kw_pvalue=kruskal(Man.loc[:,name],Feman.loc[:,name])
SexKW_Stats.append(kw_stats)
SexKW_Pvalue.append(kw_pvalue)
SexDiffResult=DataFrame()
SexDiffResult['CG']=SexDiff_name
SexDiffResult['Mann-Whitney U Stats']=SexMann_W_stats
SexDiffResult['Mann-Whitney U Pvalue']=SexMann_W_pvalue
SexDiffResult['Kruskal-Wallis H Stat']=SexKW_Stats
SexDiffResult['Kruskal-Wallis H Pvalue']=SexKW_Pvalue
SexDiffResult=DiffResult.sort_values('Mann-Whitney U Pvalue',ascending=True)
SexDiffResult
| CG | Mann-Whitney U Stats | Mann-Whitney U Pvalue | Kruskal-Wallis H Stat | Kruskal-Wallis H Pvalue | |
|---|---|---|---|---|---|
| 12 | cg08141049 | 18348.5 | 1.423253e-18 | 77.375583 | 1.413519e-18 |
| 9 | cg26039926 | 5909.5 | 2.189873e-16 | 67.433837 | 2.178774e-16 |
| 6 | cg05257372 | 6207.5 | 3.556541e-15 | 61.940912 | 3.539194e-15 |
| 7 | cg00455876 | 8289.5 | 2.437285e-14 | 58.156458 | 2.420782e-14 |
| 4 | cg00581848 | 16820.0 | 6.371911e-13 | 51.741242 | 6.331863e-13 |
| 8 | cg17482649 | 9202.0 | 1.038787e-09 | 37.261518 | 1.033031e-09 |
| 10 | cg00423014 | 7932.5 | 2.694328e-09 | 35.400669 | 2.683947e-09 |
| 11 | cg01515508 | 17054.5 | 7.118968e-08 | 29.038656 | 7.094835e-08 |
| 5 | cg05443523 | 15647.0 | 4.901541e-07 | 25.310599 | 4.880197e-07 |
| 2 | cg07066594 | 16273.5 | 3.926378e-06 | 21.306322 | 3.914385e-06 |
| 0 | cg05544622 | 9963.0 | 1.647483e-04 | 14.200984 | 1.642846e-04 |
| 1 | cg01938887 | 11455.0 | 1.482944e-01 | 2.091472 | 1.481233e-01 |
| 3 | cg22561883 | 11780.5 | 2.944031e-01 | 1.100655 | 2.941223e-01 |
| 13 | age | 13486.5 | 3.022940e-01 | 1.065306 | 3.020081e-01 |
#输出差异分析结果
SexDiffResult.to_excel('/mnt/workspace/Analysis Data/Stats Results/SexDiffResult.xlsx','UTF-8')
SexDiffResult.to_csv('/mnt/workspace/Analysis Data/Stats Results/SexDiffResult.csv')
#绘制特异性数据箱型图
for names in CGList:
plt.figure(dpi=300)
sns.boxplot(StatsData,x='gender',y=names,showmeans=True)
plt.show()
#绘制箱型图
for cg_name in CGList:
box=plt.figure(dpi=300)
sns.boxplot(StatsData,x='disease',y=cg_name,hue='gender')
plt.show()
计算相关性:年龄及其性别数据(cg07066594)的相关性。
#计算相关性
Spearman=StatsData.loc[:,CGList].corr(method='spearman')
plt.figure(dpi=300)
sns.clustermap(Spearman,annot=True)
plt.show()
<Figure size 1920x1440 with 0 Axes>
#绘制散点图查看数据分布
Scatter=plt.figure(dpi=300)
StatterAx=Scatter.add_subplot()
StatterAx.set_title('Scatterplot of methylation data')
scatter_plot=StatterAx.scatter(x=StatsData.loc[:,'cg00581848'],y=StatsData.loc[:,'cg05544622'],c=StatsData.loc[:,'cg07066594'],cmap='jet')
StatterAx.set_xlabel('cg00581848')
StatterAx.set_ylabel('cg05544622')
Scatter.colorbar(scatter_plot)
plt.show()
#绘制散点图查看数据分布
Scatter=plt.figure(dpi=300)
StatterAx=Scatter.add_subplot()
StatterAx.set_title('Scatterplot of methylation data')
scatter_plot=StatterAx.scatter(x=StatsData.loc[:,'cg00581848'],y=StatsData.loc[:,'cg05544622'],c=StatsData.loc[:,'age'],cmap='jet')
StatterAx.set_xlabel('cg00581848')
StatterAx.set_ylabel('cg05544622')
Scatter.colorbar(scatter_plot)
plt.show()
#输出统计学结果
StatsData.to_excel('/mnt/workspace/DNA methylation data/RA DNA methylation/StatsRawData.xlsx','UTF-8')
StatsData.to_csv('/mnt/workspace/DNA methylation data/RA DNA methylation/StatsRawData.csv')